25 files changed, 5270 insertions, 1329 deletions
diff --git a/src/libmpeg2new/libmpeg2/Makefile.am b/src/libmpeg2new/libmpeg2/Makefile.am
index b834c7df5..f99894f12 100644
--- a/src/libmpeg2new/libmpeg2/Makefile.am
+++ b/src/libmpeg2new/libmpeg2/Makefile.am
@@ -2,12 +2,12 @@ include $(top_srcdir)/misc/Makefile.common
 
 noinst_LTLIBRARIES = libmpeg2.la libmpeg2arch.la
 
-libmpeg2_la_SOURCES = alloc.c header.c decode.c slice.c motion_comp.c idct.c \
-                      motion_comp_mlib.c idct_mlib.c
+libmpeg2_la_SOURCES = alloc.c header.c decode.c slice.c motion_comp.c idct.c
 libmpeg2_la_LIBADD = libmpeg2arch.la
 
 libmpeg2arch_la_SOURCES = motion_comp_mmx.c idct_mmx.c \
                           motion_comp_altivec.c idct_altivec.c \
                           motion_comp_alpha.c idct_alpha.c \
+                          motion_comp_vis.c \
                           cpu_accel.c cpu_state.c
 
diff --git a/src/libmpeg2new/libmpeg2/alloc.c b/src/libmpeg2new/libmpeg2/alloc.c
index 67a5d5c6a..f1a7afa1c 100644
--- a/src/libmpeg2new/libmpeg2/alloc.c
+++ b/src/libmpeg2new/libmpeg2/alloc.c
@@ -21,56 +21,50 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include "config.h"
-
 #include <stdlib.h>
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
-#include "mpeg2_internal.h"
-
-#if defined(HAVE_MEMALIGN) && !defined(__cplusplus)
-/* some systems have memalign() but no declaration for it */
-void * memalign (size_t align, size_t size);
-#endif
 
-void * (* mpeg2_malloc_hook) (int size, int reason) = NULL;
-int (* mpeg2_free_hook) (void * buf) = NULL;
+static void * (* malloc_hook) (unsigned size, mpeg2_alloc_t reason) = NULL;
+static int (* free_hook) (void * buf) = NULL;
 
-void * mpeg2_malloc (int size, int reason)
+void * mpeg2_malloc (unsigned size, mpeg2_alloc_t reason)
 {
     char * buf;
 
-    if (mpeg2_malloc_hook) {
-	buf = (char *) mpeg2_malloc_hook (size, reason);
+    if (malloc_hook) {
+	buf = (char *) malloc_hook (size, reason);
 	if (buf)
 	    return buf;
     }
 
-#if defined(HAVE_MEMALIGN) && !defined(__cplusplus) && !defined(DEBUG)
-    return memalign (16, size);
-#else
-    buf = (char *) malloc (size + 15 + sizeof (void **));
-    if (buf) {
-	char * align_buf;
+    if (size && size <= (size_t)-1 - 63 - sizeof (void **)) { /* sum below cannot wrap */
+	buf = (char *) malloc (size + 63 + sizeof (void **));
+	if (buf) {
+	    char * align_buf;	/* 64-byte aligned address returned to the caller */
 
-	align_buf = buf + 15 + sizeof (void **);
-	align_buf -= (long)align_buf & 15;
-	*(((void **)align_buf) - 1) = buf;
-	return align_buf;
+	    align_buf = buf + 63 + sizeof (void **);
+	    align_buf -= (uintptr_t)align_buf & 63; /* round down; long can be narrower than a pointer */
+	    *(((void **)align_buf) - 1) = buf;	/* raw pointer, read back by mpeg2_free */
+	    return align_buf;
+	}
     }
     return NULL;
-#endif
 }
 
 void mpeg2_free (void * buf)
 {
-    if (mpeg2_free_hook && mpeg2_free_hook (buf))
+    if (free_hook && free_hook (buf))
 	return;
 
-#if defined(HAVE_MEMALIGN) && !defined(__cplusplus) && !defined(DEBUG)
-    free (buf);
-#else
-    free (*(((void **)buf) - 1));
-#endif
+    if (buf)
+	free (*(((void **)buf) - 1));
+}
+
+void mpeg2_malloc_hooks (void * malloc (unsigned, mpeg2_alloc_t),
+			 int free (void *))
+{   /* Install user allocator callbacks; the parameters shadow libc malloc/free here. */
+    malloc_hook = malloc;	/* tried first by mpeg2_malloc; a NULL result falls back to the built-in path */
+    free_hook = free;		/* tried first by mpeg2_free; a nonzero return means the hook handled it */
 }
diff --git a/src/libmpeg2new/libmpeg2/configure.incl b/src/libmpeg2new/libmpeg2/configure.incl
index aa9337774..f8dbd5aef 100644
--- a/src/libmpeg2new/libmpeg2/configure.incl
+++ b/src/libmpeg2new/libmpeg2/configure.incl
@@ -1,5 +1,4 @@
 AC_SUBST([LIBMPEG2_CFLAGS])
-AC_SUBST([LIBMPEG2_LIBS])
 
 dnl avoid -fPIC when possible
 AC_LIBTOOL_NON_PIC([LIBMPEG2_CFLAGS="$LIBMPEG2_CFLAGS -prefer-non-pic"])
@@ -10,16 +9,3 @@ AC_ARG_ENABLE([accel-detect],
 if test x"$enable_accel_detect" != x"no"; then
     AC_DEFINE([ACCEL_DETECT],,[autodetect accelerations])
 fi
-
-dnl check for mlib
-AC_ARG_ENABLE([mlib],
-    [  --disable-mlib          make a version not using mediaLib])
-if test x"$enable_mlib" != x"no"; then
-    cflags_save="$CFLAGS"
-    CFLAGS="$OPT_CFLAGS -L/opt/SUNWmlib/lib -R/opt/SUNWmlib/lib $CFLAGS"
-    AC_CHECK_LIB([mlib],[mlib_VideoColorYUV2RGB420],
-        [AC_DEFINE([LIBMPEG2_MLIB],,[libmpeg2 mediaLib support])
-        LIBMPEG2_CFLAGS="$LIBMPEG2_CFLAGS -I/opt/SUNWmlib/include"
-        LIBMPEG2_LIBS="$LIBMPEG2_LIBS -L/opt/SUNWmlib/lib -R/opt/SUNWmlib/lib -lmlib"])
-    CFLAGS="$cflags_save"
-fi
diff --git a/src/libmpeg2new/libmpeg2/convert_internal.h b/src/libmpeg2new/libmpeg2/convert_internal.h
new file mode 100644
index 000000000..d1e63d5e3
--- /dev/null
+++ b/src/libmpeg2new/libmpeg2/convert_internal.h
@@ -0,0 +1,42 @@
+/*
+ * convert_internal.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+typedef struct {
+    uint8_t * rgb_ptr;
+    int width;
+    int field;
+    int y_stride, rgb_stride, y_increm, uv_increm, rgb_increm, rgb_slice;
+    int chroma420, convert420;
+    int dither_offset, dither_stride;
+    int y_stride_frame, uv_stride_frame, rgb_stride_frame, rgb_stride_min;
+} convert_rgb_t;
+
+typedef void mpeg2convert_copy_t (void * id, uint8_t * const * src,
+				  unsigned int v_offset);
+
+mpeg2convert_copy_t * mpeg2convert_rgb_mmxext (int bpp, int mode,
+					       const mpeg2_sequence_t * seq);
+mpeg2convert_copy_t * mpeg2convert_rgb_mmx (int bpp, int mode,
+					    const mpeg2_sequence_t * seq);
+mpeg2convert_copy_t * mpeg2convert_rgb_vis (int bpp, int mode,
+					    const mpeg2_sequence_t * seq);
diff --git a/src/libmpeg2new/libmpeg2/cpu_accel.c b/src/libmpeg2new/libmpeg2/cpu_accel.c
index dac3cf83d..7846f1e88 100644
--- a/src/libmpeg2new/libmpeg2/cpu_accel.c
+++ b/src/libmpeg2new/libmpeg2/cpu_accel.c
@@ -1,6 +1,6 @@
 /*
  * cpu_accel.c
- * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2004 Michel Lespinasse <walken@zoy.org>
  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
@@ -26,16 +26,25 @@
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
 
-#define ACCEL_DETECT  /* Force accel on */
-
-#ifdef ACCEL_DETECT
 #ifdef ARCH_X86
-static inline uint32_t arch_accel (void)
+static inline uint32_t arch_accel (uint32_t accel)
 {
-    uint32_t eax, ebx, ecx, edx;
-    int AMD;
-    uint32_t caps;
+    if (accel & (MPEG2_ACCEL_X86_SSE3))	/* widest set first so implications chain */
+	accel |= MPEG2_ACCEL_X86_SSE2;
+
+    if (accel & (MPEG2_ACCEL_X86_SSE2 | MPEG2_ACCEL_X86_SSE3))
+	accel |= MPEG2_ACCEL_X86_MMXEXT;
+
+    if (accel & (MPEG2_ACCEL_X86_3DNOW | MPEG2_ACCEL_X86_MMXEXT))
+	accel |= MPEG2_ACCEL_X86_MMX;	/* now also reached when only SSE2/SSE3 was forced */
+
+#ifdef ACCEL_DETECT
+    if (accel & MPEG2_ACCEL_DETECT) {
+	uint32_t eax, ebx, ecx, edx;
+	int AMD;
 
 #if !defined(PIC) && !defined(__PIC__)
 #define cpuid(op,eax,ebx,ecx,edx)	\
@@ -60,55 +69,63 @@ static inline uint32_t arch_accel (void)
 	     : "cc")
 #endif
 
-    __asm__ ("pushf\n\t"
-	     "pushf\n\t"
-	     "pop %0\n\t"
-	     "movl %0,%1\n\t"
-	     "xorl $0x200000,%0\n\t"
-	     "push %0\n\t"
-	     "popf\n\t"
-	     "pushf\n\t"
-	     "pop %0\n\t"
-	     "popf"
-	     : "=r" (eax),
-	       "=r" (ebx)
-	     :
-	     : "cc");
+	__asm__ ("pushf\n\t"
+		 "pushf\n\t"
+		 "pop %0\n\t"
+		 "movl %0,%1\n\t"
+		 "xorl $0x200000,%0\n\t"
+		 "push %0\n\t"
+		 "popf\n\t"
+		 "pushf\n\t"
+		 "pop %0\n\t"
+		 "popf"
+		 : "=r" (eax),
+		 "=r" (ebx)
+		 :
+		 : "cc");
 
-    if (eax == ebx)		/* no cpuid */
-	return 0;
+	if (eax == ebx)			/* no cpuid */
+	    return accel;
 
-    cpuid (0x00000000, eax, ebx, ecx, edx);
-    if (!eax)			/* vendor string only */
-	return 0;
+	cpuid (0x00000000, eax, ebx, ecx, edx);
+	if (!eax)			/* vendor string only */
+	    return accel;
 
-    AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
+	AMD = (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65);
 
-    cpuid (0x00000001, eax, ebx, ecx, edx);
-    if (! (edx & 0x00800000))	/* no MMX */
-	return 0;
+	cpuid (0x00000001, eax, ebx, ecx, edx);
+	if (! (edx & 0x00800000))	/* no MMX */
+	    return accel;
 
-    caps = MPEG2_ACCEL_X86_MMX;
-    if (edx & 0x02000000)	/* SSE - identical to AMD MMX extensions */
-	caps = MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
+	accel |= MPEG2_ACCEL_X86_MMX;
+	if (edx & 0x02000000)	/* SSE - identical to AMD MMX extensions */
+	    accel |= MPEG2_ACCEL_X86_MMXEXT;
 
-    cpuid (0x80000000, eax, ebx, ecx, edx);
-    if (eax < 0x80000001)	/* no extended capabilities */
-	return caps;
+	if (edx & 0x04000000)	/* SSE2 */
+	    accel |= MPEG2_ACCEL_X86_SSE2;
+	    
+	if (ecx & 0x00000001)	/* SSE3 */
+	    accel |= MPEG2_ACCEL_X86_SSE3;
+	    
+	cpuid (0x80000000, eax, ebx, ecx, edx);
+	if (eax < 0x80000001)		/* no extended capabilities */
+	    return accel;
 
-    cpuid (0x80000001, eax, ebx, ecx, edx);
+	cpuid (0x80000001, eax, ebx, ecx, edx);
 
-    if (edx & 0x80000000)
-	caps |= MPEG2_ACCEL_X86_3DNOW;
+	if (edx & 0x80000000)
+	    accel |= MPEG2_ACCEL_X86_3DNOW;
 
-    if (AMD && (edx & 0x00400000))	/* AMD MMX extensions */
-	caps |= MPEG2_ACCEL_X86_MMXEXT;
+	if (AMD && (edx & 0x00400000))	/* AMD MMX extensions */
+	    accel |= MPEG2_ACCEL_X86_MMXEXT;
+    }
+#endif /* ACCEL_DETECT */
 
-    return caps;
+    return accel;
 }
 #endif /* ARCH_X86 */
 
-#ifdef ARCH_PPC
+#if defined(ACCEL_DETECT) && (defined(ARCH_PPC) || defined(ARCH_SPARC))
 #include <signal.h>
 #include <setjmp.h>
 
@@ -125,60 +142,117 @@ static RETSIGTYPE sigill_handler (int sig)
     canjump = 0;
     siglongjmp (jmpbuf, 1);
 }
+#endif /* ACCEL_DETECT && (ARCH_PPC || ARCH_SPARC) */
 
-static inline uint32_t arch_accel (void)
+#ifdef ARCH_PPC
+static inline uint32_t arch_accel (uint32_t accel)
 {
-    static RETSIGTYPE (* oldsig) (int);
+#ifdef ACCEL_DETECT
+    if ((accel & (MPEG2_ACCEL_PPC_ALTIVEC | MPEG2_ACCEL_DETECT)) ==
+	MPEG2_ACCEL_DETECT) {	/* DETECT set, ALTIVEC not forced ('==' outranks '&') */
+	static RETSIGTYPE (* oldsig) (int);
 
-    oldsig = signal (SIGILL, sigill_handler);
-    if (sigsetjmp (jmpbuf, 1)) {
-	signal (SIGILL, oldsig);
-	return 0;
-    }
+	oldsig = signal (SIGILL, sigill_handler);
+	if (sigsetjmp (jmpbuf, 1)) {
+	    signal (SIGILL, oldsig);
+	    return accel;
+	}
 
-    canjump = 1;
+	canjump = 1;
 
 #ifdef HAVE_ALTIVEC_H	/* gnu */
 #define VAND(a,b,c) "vand " #a "," #b "," #c "\n\t"
 #else			/* apple */
 #define VAND(a,b,c) "vand v" #a ",v" #b ",v" #c "\n\t"
 #endif
-    asm volatile ("mtspr 256, %0\n\t"
-		  VAND (0, 0, 0)
-		  :
-		  : "r" (-1));
+	asm volatile ("mtspr 256, %0\n\t"
+		      VAND (0, 0, 0)
+		      :
+		      : "r" (-1));
 
-    signal (SIGILL, oldsig);
-    return MPEG2_ACCEL_PPC_ALTIVEC;
+	canjump = 0;
+	accel |= MPEG2_ACCEL_PPC_ALTIVEC;
+
+	signal (SIGILL, oldsig);
+    }
+#endif /* ACCEL_DETECT */
+
+    return accel;
 }
 #endif /* ARCH_PPC */
 
-#ifdef ARCH_ALPHA
-static inline uint32_t arch_accel (void)
+#ifdef ARCH_SPARC
+static inline uint32_t arch_accel (uint32_t accel)
 {
-    uint64_t no_mvi;
+    if (accel & MPEG2_ACCEL_SPARC_VIS2)
+	accel |= MPEG2_ACCEL_SPARC_VIS;
+
+#ifdef ACCEL_DETECT
+    if ((accel & (MPEG2_ACCEL_SPARC_VIS2 | MPEG2_ACCEL_DETECT)) ==
+	MPEG2_ACCEL_DETECT) {	/* DETECT set, VIS2 not forced ('==' outranks '&') */
+	static RETSIGTYPE (* oldsig) (int);
+
+	oldsig = signal (SIGILL, sigill_handler);
+	if (sigsetjmp (jmpbuf, 1)) {
+	    signal (SIGILL, oldsig);
+	    return accel;
+	}
+
+	canjump = 1;
+
+	/* pdist %f0, %f0, %f0 */
+	__asm__ __volatile__(".word\t0x81b007c0");
 
-    asm volatile ("amask %1, %0"
-		  : "=r" (no_mvi)
-		  : "rI" (256));	/* AMASK_MVI */
-    return no_mvi ? MPEG2_ACCEL_ALPHA : (MPEG2_ACCEL_ALPHA |
-					 MPEG2_ACCEL_ALPHA_MVI);
+	canjump = 0;
+	accel |= MPEG2_ACCEL_SPARC_VIS;
+
+	if (sigsetjmp (jmpbuf, 1)) {
+	    signal (SIGILL, oldsig);
+	    return accel;
+	}
+
+	canjump = 1;
+
+	/* edge8n %g0, %g0, %g0 */
+	__asm__ __volatile__(".word\t0x81b00020");
+
+	canjump = 0;
+	accel |= MPEG2_ACCEL_SPARC_VIS2;
+
+	signal (SIGILL, oldsig);
+    }
+#endif /* ACCEL_DETECT */
+
+    return accel;
 }
-#endif /* ARCH_ALPHA */
-#endif
+#endif /* ARCH_SPARC */
 
-uint32_t mpeg2_detect_accel (void)
+#ifdef ARCH_ALPHA
+static inline uint32_t arch_accel (uint32_t accel)
 {
-    uint32_t accel;
+    if (accel & MPEG2_ACCEL_ALPHA_MVI)
+	accel |= MPEG2_ACCEL_ALPHA;
 
-    accel = 0;
 #ifdef ACCEL_DETECT
-#ifdef LIBMPEG2_MLIB
-    accel = MPEG2_ACCEL_MLIB;
-#endif
-#if defined (ARCH_X86) || defined (ARCH_PPC) || defined (ARCH_ALPHA)
-    accel |= arch_accel ();
-#endif
+    if (accel & MPEG2_ACCEL_DETECT) {
+	uint64_t no_mvi;
+
+	asm volatile ("amask %1, %0"
+		      : "=r" (no_mvi)
+		      : "rI" (256));	/* AMASK_MVI */
+	accel |= no_mvi ? MPEG2_ACCEL_ALPHA : (MPEG2_ACCEL_ALPHA |
+					       MPEG2_ACCEL_ALPHA_MVI);
+    }
+#endif /* ACCEL_DETECT */
+
+    return accel;
+}
+#endif /* ARCH_ALPHA */
+
+uint32_t mpeg2_detect_accel (uint32_t accel)
+{
+#if defined (ARCH_X86) || defined (ARCH_PPC) || defined (ARCH_ALPHA) || defined (ARCH_SPARC)
+    accel = arch_accel (accel);
 #endif
     return accel;
 }
diff --git a/src/libmpeg2new/libmpeg2/cpu_state.c b/src/libmpeg2new/libmpeg2/cpu_state.c
index 6761747fa..edbf2dd28 100644
--- a/src/libmpeg2new/libmpeg2/cpu_state.c
+++ b/src/libmpeg2new/libmpeg2/cpu_state.c
@@ -27,8 +27,8 @@
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
-#include "mpeg2_internal.h"
 #include "../include/attributes.h"
+#include "mpeg2_internal.h"
 #ifdef ARCH_X86
 #include "../include/mmx.h"
 #endif
diff --git a/src/libmpeg2new/libmpeg2/decode.c b/src/libmpeg2new/libmpeg2/decode.c
index 7d096c835..337ba4466 100644
--- a/src/libmpeg2new/libmpeg2/decode.c
+++ b/src/libmpeg2new/libmpeg2/decode.c
@@ -23,14 +23,13 @@
 
 #include "config.h"
 
-#include <stdio.h>  /* For testing printf */
 #include <string.h>	/* memcmp/memset, try to remove */
 #include <stdlib.h>
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
+#include "../include/attributes.h"
 #include "mpeg2_internal.h"
-#include "../include/convert.h"
 
 static int mpeg2_accels = 0;
 
@@ -45,7 +44,6 @@ static inline int skip_chunk (mpeg2dec_t * mpeg2dec, int bytes)
 {
     uint8_t * current;
     uint32_t shift;
-    uint8_t * chunk_ptr;
     uint8_t * limit;
     uint8_t byte;
 
@@ -54,7 +52,6 @@ static inline int skip_chunk (mpeg2dec_t * mpeg2dec, int bytes)
 
     current = mpeg2dec->buf_start;
     shift = mpeg2dec->shift;
-    chunk_ptr = mpeg2dec->chunk_ptr;
     limit = current + bytes;
 
     do {
@@ -129,30 +126,25 @@ static inline mpeg2_state_t seek_chunk (mpeg2dec_t * mpeg2dec)
     size = mpeg2dec->buf_end - mpeg2dec->buf_start;
     skipped = skip_chunk (mpeg2dec, size);
     if (!skipped) {
-	mpeg2dec->bytes_since_pts += size;
+	mpeg2dec->bytes_since_tag += size;
 	return STATE_BUFFER;
     }
-    mpeg2dec->bytes_since_pts += skipped;
+    mpeg2dec->bytes_since_tag += skipped;
     mpeg2dec->code = mpeg2dec->buf_start[-1];
-    return (mpeg2_state_t)-1;
+    return STATE_INTERNAL_NORETURN;
 }
 
-static mpeg2_state_t seek_header (mpeg2dec_t * mpeg2dec)
+mpeg2_state_t mpeg2_seek_header (mpeg2dec_t * mpeg2dec)
 {
-    while (mpeg2dec->code != 0xb3 &&
-	   ((mpeg2dec->code != 0xb7 && mpeg2dec->code != 0xb8 &&
-	     mpeg2dec->code) || mpeg2dec->sequence.width == (unsigned)-1))
+    while (!(mpeg2dec->code == 0xb3 ||
+	     ((mpeg2dec->code == 0xb7 || mpeg2dec->code == 0xb8 ||
+	       !mpeg2dec->code) && mpeg2dec->sequence.width != (unsigned)-1)))
 	if (seek_chunk (mpeg2dec) == STATE_BUFFER)
 	    return STATE_BUFFER;
     mpeg2dec->chunk_start = mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
-    return (mpeg2dec->code ? mpeg2_parse_header (mpeg2dec) :
-	    mpeg2_header_picture_start (mpeg2dec));
-}
-
-mpeg2_state_t mpeg2_seek_sequence (mpeg2dec_t * mpeg2dec)
-{
-    mpeg2dec->sequence.width = (unsigned)-1;
-    return seek_header (mpeg2dec);
+    mpeg2dec->user_data_len = 0;
+    return ((mpeg2dec->code == 0xb7) ?
+	    mpeg2_header_end (mpeg2dec) : mpeg2_parse_header (mpeg2dec));
 }
 
 #define RECEIVED(code,state) (((state) << 8) + (code))
@@ -165,7 +157,7 @@ mpeg2_state_t mpeg2_parse (mpeg2dec_t * mpeg2dec)
 	mpeg2_state_t state;
 
 	state = mpeg2dec->action (mpeg2dec);
-	if ((int)state >= 0)
+	if ((int)state > (int)STATE_INTERNAL_NORETURN)
 	    return state;
     }
 
@@ -178,7 +170,7 @@ mpeg2_state_t mpeg2_parse (mpeg2dec_t * mpeg2dec)
 	    if (size_buffer <= size_chunk) {
 		copied = copy_chunk (mpeg2dec, size_buffer);
 		if (!copied) {
-		    mpeg2dec->bytes_since_pts += size_buffer;
+		    mpeg2dec->bytes_since_tag += size_buffer;
 		    mpeg2dec->chunk_ptr += size_buffer;
 		    return STATE_BUFFER;
 		}
@@ -186,12 +178,12 @@ mpeg2_state_t mpeg2_parse (mpeg2dec_t * mpeg2dec)
 		copied = copy_chunk (mpeg2dec, size_chunk);
 		if (!copied) {
 		    /* filled the chunk buffer without finding a start code */
-		    mpeg2dec->bytes_since_pts += size_chunk;
+		    mpeg2dec->bytes_since_tag += size_chunk;
 		    mpeg2dec->action = seek_chunk;
 		    return STATE_INVALID;
 		}
 	    }
-	    mpeg2dec->bytes_since_pts += copied;
+	    mpeg2dec->bytes_since_tag += copied;
 
 	    mpeg2_slice (&(mpeg2dec->decoder), mpeg2dec->code,
 			 mpeg2dec->chunk_start);
@@ -203,64 +195,19 @@ mpeg2_state_t mpeg2_parse (mpeg2dec_t * mpeg2dec)
 	if (seek_chunk (mpeg2dec) == STATE_BUFFER)
 	    return STATE_BUFFER;
     }
+
+    mpeg2dec->action = mpeg2_seek_header;
     switch (mpeg2dec->code) {
     case 0x00:
-	mpeg2dec->action = mpeg2_header_picture_start;
-    if (mpeg2dec->state == STATE_SLICE) {
-      mpeg2dec->info.current_picture = mpeg2dec->info.current_picture_2nd = NULL;
-      mpeg2dec->info.display_picture = mpeg2dec->info.display_picture_2nd = NULL;
-      mpeg2dec->info.current_fbuf = mpeg2dec->info.display_fbuf = mpeg2dec->info.discard_fbuf = NULL;
-      mpeg2dec->info.user_data = NULL;
-      mpeg2dec->info.user_data_len = 0;
-
-      mpeg2dec->info.current_fbuf = mpeg2dec->fbuf[0];
-      if (mpeg2dec->decoder.coding_type == B_TYPE) {
-        mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
-        mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
-        mpeg2dec->fbuf[0]=0;
-      } else {
-        mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
-        mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[2];
-        mpeg2dec->fbuf[2]=0;
-      }
-    }
-
 	return mpeg2dec->state;
-    case 0xb7:
-	mpeg2dec->action = mpeg2_header_end;
-	break;
     case 0xb3:
+    case 0xb7:
     case 0xb8:
-	mpeg2dec->action = mpeg2_parse_header;
-	break;
-    case 0xb2:
-        printf("libmpeg2:USER DATA for CC\n");
+	return (mpeg2dec->state == STATE_SLICE) ? STATE_SLICE : STATE_INVALID;
     default:
 	mpeg2dec->action = seek_chunk;
 	return STATE_INVALID;
     }
-    if (mpeg2dec->state == STATE_SLICE) {
-      mpeg2dec->info.current_picture = mpeg2dec->info.current_picture_2nd = NULL;
-      mpeg2dec->info.display_picture = mpeg2dec->info.display_picture_2nd = NULL;
-      mpeg2dec->info.current_fbuf = mpeg2dec->info.display_fbuf = mpeg2dec->info.discard_fbuf = NULL;
-      mpeg2dec->info.user_data = NULL;
-      mpeg2dec->info.user_data_len = 0;
-
-      mpeg2dec->info.current_fbuf = mpeg2dec->fbuf[0];
-      if (mpeg2dec->decoder.coding_type == B_TYPE) {
-        mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
-        mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
-        mpeg2dec->fbuf[0]=0;
-      } else {
-        mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
-        mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[2];
-        mpeg2dec->fbuf[2]=0;
-      }
-    }
-
-
-
-    return (mpeg2dec->state == STATE_SLICE) ? STATE_SLICE : STATE_INVALID;
 }
 
 mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec)
@@ -272,6 +219,7 @@ mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec)
     int size_buffer, size_chunk, copied;
 
     mpeg2dec->action = mpeg2_parse_header;
+    mpeg2dec->info.user_data = NULL;	mpeg2dec->info.user_data_len = 0;
     while (1) {
 	size_buffer = mpeg2dec->buf_end - mpeg2dec->buf_start;
 	size_chunk = (mpeg2dec->chunk_buffer + BUFFER_SIZE -
@@ -279,7 +227,7 @@ mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec)
 	if (size_buffer <= size_chunk) {
 	    copied = copy_chunk (mpeg2dec, size_buffer);
 	    if (!copied) {
-		mpeg2dec->bytes_since_pts += size_buffer;
+		mpeg2dec->bytes_since_tag += size_buffer;
 		mpeg2dec->chunk_ptr += size_buffer;
 		return STATE_BUFFER;
 	    }
@@ -287,17 +235,17 @@ mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec)
 	    copied = copy_chunk (mpeg2dec, size_chunk);
 	    if (!copied) {
 		/* filled the chunk buffer without finding a start code */
-		mpeg2dec->bytes_since_pts += size_chunk;
+		mpeg2dec->bytes_since_tag += size_chunk;
 		mpeg2dec->code = 0xb4;
-		mpeg2dec->action = seek_header;
+		mpeg2dec->action = mpeg2_seek_header;
 		return STATE_INVALID;
 	    }
 	}
-	mpeg2dec->bytes_since_pts += copied;
+	mpeg2dec->bytes_since_tag += copied;
 
 	if (process_header[mpeg2dec->code & 0x0b] (mpeg2dec)) {
 	    mpeg2dec->code = mpeg2dec->buf_start[-1];
-	    mpeg2dec->action = seek_header;
+	    mpeg2dec->action = mpeg2_seek_header;
 	    return STATE_INVALID;
 	}
 
@@ -306,18 +254,17 @@ mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec)
 
 	/* state transition after a sequence header */
 	case RECEIVED (0x00, STATE_SEQUENCE):
-	    mpeg2dec->action = mpeg2_header_picture_start;
 	case RECEIVED (0xb8, STATE_SEQUENCE):
 	    mpeg2_header_sequence_finalize (mpeg2dec);
 	    break;
 
 	/* other legal state transitions */
 	case RECEIVED (0x00, STATE_GOP):
-	    mpeg2dec->action = mpeg2_header_picture_start;
+	    mpeg2_header_gop_finalize (mpeg2dec);
 	    break;
 	case RECEIVED (0x01, STATE_PICTURE):
 	case RECEIVED (0x01, STATE_PICTURE_2ND):
-	    mpeg2_header_matrix_finalize (mpeg2dec);
+	    mpeg2_header_picture_finalize (mpeg2dec, mpeg2_accels);
 	    mpeg2dec->action = mpeg2_header_slice_start;
 	    break;
 
@@ -333,48 +280,49 @@ mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec)
 	    continue;
 
 	default:
-	    mpeg2dec->action = seek_header;
+	    mpeg2dec->action = mpeg2_seek_header;
 	    return STATE_INVALID;
 	}
 
 	mpeg2dec->chunk_start = mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+	mpeg2dec->user_data_len = 0;
 	return mpeg2dec->state;
     }
 }
 
-void mpeg2_convert (mpeg2dec_t * mpeg2dec,
-		    void (* convert) (int, int, uint32_t, void *,
-				      struct convert_init_s *), void * arg)
+int mpeg2_convert (mpeg2dec_t * mpeg2dec, mpeg2_convert_t convert, void * arg)
+{   /* Install `convert' as the output converter; returns the converter's own result. */
+    mpeg2_convert_init_t convert_init;	/* id_size is read back from this below */
+    int error;
+    /* MPEG2_CONVERT_SET stage: the converter is handed the current sequence and accel flags */
+    error = convert (MPEG2_CONVERT_SET, NULL, &(mpeg2dec->sequence), 0,
+		     mpeg2_accels, arg, &convert_init);
+    if (!error) {	/* on a nonzero result the decoder's converter fields are left untouched */
+	mpeg2dec->convert = convert;
+	mpeg2dec->convert_arg = arg;
+	mpeg2dec->convert_id_size = convert_init.id_size;
+	mpeg2dec->convert_stride = 0;	/* NOTE(review): presumably set later through mpeg2_stride -- confirm */
+    }
+    return error;
+}
+
+int mpeg2_stride (mpeg2dec_t * mpeg2dec, int stride)
 {
-    convert_init_t convert_init;
-    int size;
-
-    convert_init.id = NULL;
-    convert (mpeg2dec->decoder.width, mpeg2dec->decoder.height,
-	     mpeg2_accels, arg, &convert_init);
-    if (convert_init.id_size) {
-	convert_init.id = mpeg2dec->convert_id =
-	    mpeg2_malloc (convert_init.id_size, ALLOC_CONVERT_ID);
-	convert (mpeg2dec->decoder.width, mpeg2dec->decoder.height,
-		 mpeg2_accels, arg, &convert_init);
+    if (!mpeg2dec->convert) {
+	if (stride < (int) mpeg2dec->sequence.width)
+	    stride = mpeg2dec->sequence.width;
+	mpeg2dec->decoder.stride_frame = stride;
+    } else {
+	mpeg2_convert_init_t convert_init;
+
+	stride = mpeg2dec->convert (MPEG2_CONVERT_STRIDE, NULL,
+				    &(mpeg2dec->sequence), stride,
+				    mpeg2_accels, mpeg2dec->convert_arg,
+				    &convert_init);
+	mpeg2dec->convert_id_size = convert_init.id_size;
+	mpeg2dec->convert_stride = stride;
     }
-    mpeg2dec->convert_size[0] = size = convert_init.buf_size[0];
-    mpeg2dec->convert_size[1] = size += convert_init.buf_size[1];
-    mpeg2dec->convert_size[2] = size += convert_init.buf_size[2];
-    mpeg2dec->convert_start = convert_init.start;
-    mpeg2dec->convert_copy = convert_init.copy;
-
-    size = mpeg2dec->decoder.width * mpeg2dec->decoder.height >> 2;
-    mpeg2dec->yuv_buf[0][0] = (uint8_t *) mpeg2_malloc (6 * size, ALLOC_YUV);
-    mpeg2dec->yuv_buf[0][1] = mpeg2dec->yuv_buf[0][0] + 4 * size;
-    mpeg2dec->yuv_buf[0][2] = mpeg2dec->yuv_buf[0][0] + 5 * size;
-    mpeg2dec->yuv_buf[1][0] = (uint8_t *) mpeg2_malloc (6 * size, ALLOC_YUV);
-    mpeg2dec->yuv_buf[1][1] = mpeg2dec->yuv_buf[1][0] + 4 * size;
-    mpeg2dec->yuv_buf[1][2] = mpeg2dec->yuv_buf[1][0] + 5 * size;
-    size = mpeg2dec->decoder.width * 8;
-    mpeg2dec->yuv_buf[2][0] = (uint8_t *) mpeg2_malloc (6 * size, ALLOC_YUV);
-    mpeg2dec->yuv_buf[2][1] = mpeg2dec->yuv_buf[2][0] + 4 * size;
-    mpeg2dec->yuv_buf[2][2] = mpeg2dec->yuv_buf[2][0] + 5 * size;
+    return stride;
 }
 
 void mpeg2_set_buf (mpeg2dec_t * mpeg2dec, uint8_t * buf[3], void * id)
@@ -382,12 +330,13 @@ void mpeg2_set_buf (mpeg2dec_t * mpeg2dec, uint8_t * buf[3], void * id)
     mpeg2_fbuf_t * fbuf;
 
     if (mpeg2dec->custom_fbuf) {
-	mpeg2_set_fbuf (mpeg2dec, mpeg2dec->decoder.coding_type);
-	fbuf = mpeg2dec->fbuf[0];
 	if (mpeg2dec->state == STATE_SEQUENCE) {
 	    mpeg2dec->fbuf[2] = mpeg2dec->fbuf[1];
 	    mpeg2dec->fbuf[1] = mpeg2dec->fbuf[0];
 	}
+	mpeg2_set_fbuf (mpeg2dec, (mpeg2dec->decoder.coding_type ==
+				   PIC_FLAG_CODING_TYPE_B));
+	fbuf = mpeg2dec->fbuf[0];
     } else {
 	fbuf = &(mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index].fbuf);
 	mpeg2dec->alloc_index_user = ++mpeg2dec->alloc_index;
@@ -401,10 +350,6 @@ void mpeg2_set_buf (mpeg2dec_t * mpeg2dec, uint8_t * buf[3], void * id)
 void mpeg2_custom_fbuf (mpeg2dec_t * mpeg2dec, int custom_fbuf)
 {
     mpeg2dec->custom_fbuf = custom_fbuf;
-    mpeg2dec->fbuf[0] = NULL;
-    mpeg2dec->fbuf[1] = NULL;
-    mpeg2dec->fbuf[2] = NULL;
-
 }
 
 void mpeg2_skip (mpeg2dec_t * mpeg2dec, int skip)
@@ -421,27 +366,48 @@ void mpeg2_slice_region (mpeg2dec_t * mpeg2dec, int start, int end)
     mpeg2dec->nb_decode_slices = end - start;
 }
 
-void mpeg2_pts (mpeg2dec_t * mpeg2dec, uint32_t pts)
+void mpeg2_tag_picture (mpeg2dec_t * mpeg2dec, uint32_t tag, uint32_t tag2)
 {
-    mpeg2dec->pts_previous = mpeg2dec->pts_current;
-    mpeg2dec->pts_current = pts;
-    mpeg2dec->num_pts++;
-    mpeg2dec->bytes_since_pts = 0;
+    mpeg2dec->tag_previous = mpeg2dec->tag_current;
+    mpeg2dec->tag2_previous = mpeg2dec->tag2_current;
+    mpeg2dec->tag_current = tag;
+    mpeg2dec->tag2_current = tag2;
+    mpeg2dec->num_tags++;
+    mpeg2dec->bytes_since_tag = 0;
 }
 
 uint32_t mpeg2_accel (uint32_t accel)
 {
     if (!mpeg2_accels) {
-	if (accel & MPEG2_ACCEL_DETECT)
-	    accel |= mpeg2_detect_accel ();
-	mpeg2_accels = accel |= MPEG2_ACCEL_DETECT;
-	mpeg2_cpu_state_init (accel);
-	mpeg2_idct_init (accel);
-	mpeg2_mc_init (accel);
+	mpeg2_accels = mpeg2_detect_accel (accel) | MPEG2_ACCEL_DETECT;
+	mpeg2_cpu_state_init (mpeg2_accels);
+	mpeg2_idct_init (mpeg2_accels);
+	mpeg2_mc_init (mpeg2_accels);
     }
     return mpeg2_accels & ~MPEG2_ACCEL_DETECT;
 }
 
+void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
+{   /* Return the parser to its start-code seeking state; full_reset also drops sequence state. */
+    mpeg2dec->buf_start = mpeg2dec->buf_end = NULL;	/* forget any pending input buffer */
+    mpeg2dec->num_tags = 0;
+    mpeg2dec->shift = 0xffffff00;
+    mpeg2dec->code = 0xb4;	/* matches no case tested by mpeg2_seek_header, so it scans for a new start code */
+    mpeg2dec->action = mpeg2_seek_header;
+    mpeg2dec->state = STATE_INVALID;
+    mpeg2dec->first = 1;
+    /* clear everything published to the caller through mpeg2dec->info */
+    mpeg2_reset_info(&(mpeg2dec->info));
+    mpeg2dec->info.gop = NULL;
+    mpeg2dec->info.user_data = NULL;
+    mpeg2dec->info.user_data_len = 0;
+    if (full_reset) {	/* additionally forget the sequence and re-run header state init */
+	mpeg2dec->info.sequence = NULL;
+	mpeg2_header_state_init (mpeg2dec);
+    }
+
+}
+
 mpeg2dec_t * mpeg2_init (void)
 {
     mpeg2dec_t * mpeg2dec;
@@ -449,42 +415,25 @@ mpeg2dec_t * mpeg2_init (void)
     mpeg2_accel (MPEG2_ACCEL_DETECT);
 
     mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
-					    ALLOC_MPEG2DEC);
+					    MPEG2_ALLOC_MPEG2DEC);
     if (mpeg2dec == NULL)
 	return NULL;
 
-    memset (mpeg2dec, 0, sizeof (mpeg2dec_t));
+    memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
+    memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));
 
     mpeg2dec->chunk_buffer = (uint8_t *) mpeg2_malloc (BUFFER_SIZE + 4,
-						       ALLOC_CHUNK);
+						       MPEG2_ALLOC_CHUNK);
 
-    mpeg2dec->shift = 0xffffff00;
-    mpeg2dec->action = mpeg2_seek_sequence;
-    mpeg2dec->code = 0xb4;
-    mpeg2dec->first_decode_slice = 1;
-    mpeg2dec->nb_decode_slices = 0xb0 - 1;
-    mpeg2dec->convert_id = NULL;
+    mpeg2dec->sequence.width = (unsigned)-1;
+    mpeg2_reset (mpeg2dec, 1);
 
-    /* initialize substructures */
-    mpeg2_header_state_init (mpeg2dec);
     return mpeg2dec;
 }
 
 void mpeg2_close (mpeg2dec_t * mpeg2dec)
 {
-    int i;
-
-    /* static uint8_t finalizer[] = {0,0,1,0xb4}; */
-    /* mpeg2_decode_data (mpeg2dec, finalizer, finalizer+4); */
-
+    mpeg2_header_state_init (mpeg2dec);
     mpeg2_free (mpeg2dec->chunk_buffer);
-    if (!mpeg2dec->custom_fbuf)
-	for (i = mpeg2dec->alloc_index_user; i < mpeg2dec->alloc_index; i++)
-	    mpeg2_free (mpeg2dec->fbuf_alloc[i].fbuf.buf[0]);
-    if (mpeg2dec->convert_start)
-	for (i = 0; i < 3; i++)
-	    mpeg2_free (mpeg2dec->yuv_buf[i][0]);
-    if (mpeg2dec->convert_id)
-	mpeg2_free (mpeg2dec->convert_id);
     mpeg2_free (mpeg2dec);
 }
diff --git a/src/libmpeg2new/libmpeg2/header.c b/src/libmpeg2new/libmpeg2/header.c
index 894a86b30..935a50aa3 100644
--- a/src/libmpeg2new/libmpeg2/header.c
+++ b/src/libmpeg2new/libmpeg2/header.c
@@ -1,6 +1,7 @@
 /*
  * header.c
  * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2003      Regis Duchesne <hpreg@zoy.org>
  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
@@ -23,16 +24,13 @@
 
 #include "config.h"
 
-#include <stdio.h>      /* For printf */
 #include <inttypes.h>
 #include <stdlib.h>	/* defines NULL */
 #include <string.h>	/* memcmp */
-#include <assert.h>
 
 #include "../include/mpeg2.h"
-#include "mpeg2_internal.h"
-#include "../include/convert.h"
 #include "../include/attributes.h"
+#include "mpeg2_internal.h"
 
 #define SEQ_EXT 2
 #define SEQ_DISPLAY_EXT 4
@@ -78,7 +76,29 @@ uint8_t mpeg2_scan_alt[64] ATTR_ALIGN(16) = {
 
 void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec)
 {
-    mpeg2dec->decoder.scan = mpeg2_scan_norm;
+    if (mpeg2dec->sequence.width != (unsigned)-1) {
+	int i;
+
+	mpeg2dec->sequence.width = (unsigned)-1;
+	if (!mpeg2dec->custom_fbuf)
+	    for (i = mpeg2dec->alloc_index_user;
+		 i < mpeg2dec->alloc_index; i++) {
+		mpeg2_free (mpeg2dec->fbuf_alloc[i].fbuf.buf[0]);
+		mpeg2_free (mpeg2dec->fbuf_alloc[i].fbuf.buf[1]);
+		mpeg2_free (mpeg2dec->fbuf_alloc[i].fbuf.buf[2]);
+	    }
+	if (mpeg2dec->convert_start)
+	    for (i = 0; i < 3; i++) {
+		mpeg2_free (mpeg2dec->yuv_buf[i][0]);
+		mpeg2_free (mpeg2dec->yuv_buf[i][1]);
+		mpeg2_free (mpeg2dec->yuv_buf[i][2]);
+	    }
+	if (mpeg2dec->decoder.convert_id)
+	    mpeg2_free (mpeg2dec->decoder.convert_id);
+    }
+    mpeg2dec->decoder.coding_type = I_TYPE;
+    mpeg2dec->decoder.convert = NULL;
+    mpeg2dec->decoder.convert_id = NULL;
     mpeg2dec->picture = mpeg2dec->pictures;
     mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[0].fbuf;
     mpeg2dec->fbuf[1] = &mpeg2dec->fbuf_alloc[1].fbuf;
@@ -86,22 +106,39 @@ void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec)
     mpeg2dec->first = 1;
     mpeg2dec->alloc_index = 0;
     mpeg2dec->alloc_index_user = 0;
+    mpeg2dec->first_decode_slice = 1;
+    mpeg2dec->nb_decode_slices = 0xb0 - 1;
+    mpeg2dec->convert = NULL;
+    mpeg2dec->convert_start = NULL;
+    mpeg2dec->custom_fbuf = 0;
+    mpeg2dec->yuv_index = 0;
 }
 
-static void reset_info (mpeg2_info_t * info)
+void mpeg2_reset_info (mpeg2_info_t * info)
 {
     info->current_picture = info->current_picture_2nd = NULL;
     info->display_picture = info->display_picture_2nd = NULL;
     info->current_fbuf = info->display_fbuf = info->discard_fbuf = NULL;
-    info->user_data = NULL;	info->user_data_len = 0;
+}
+
+static void info_user_data (mpeg2dec_t * mpeg2dec)
+{
+    if (mpeg2dec->user_data_len) {
+	mpeg2dec->info.user_data = mpeg2dec->chunk_buffer;
+	mpeg2dec->info.user_data_len = mpeg2dec->user_data_len - 3;
+    }
 }
 
 int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec)
 {
     uint8_t * buffer = mpeg2dec->chunk_start;
     mpeg2_sequence_t * sequence = &(mpeg2dec->new_sequence);
-    static unsigned int frame_period[9] = {
-	0, 1126125, 1125000, 1080000, 900900, 900000, 540000, 450450, 450000
+    static unsigned int frame_period[16] = {
+	0, 1126125, 1125000, 1080000, 900900, 900000, 540000, 450450, 450000,
+	/* unofficial: xing 15 fps */
+	1800000,
+	/* unofficial: libmpeg3 "Unofficial economy rates" 5/10/12/15 fps */
+	5400000, 2700000, 2250000, 1800000, 0, 0
     };
     int i;
 
@@ -122,9 +159,7 @@ int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec)
 		       SEQ_VIDEO_FORMAT_UNSPECIFIED);
 
     sequence->pixel_width = buffer[3] >> 4;	/* aspect ratio */
-    sequence->frame_period = 0;
-    if ((buffer[3] & 15) < 9)
-	sequence->frame_period = frame_period[buffer[3] & 15];
+    sequence->frame_period = frame_period[buffer[3] & 15];
 
     sequence->byte_rate = (buffer[4]<<10) | (buffer[5]<<2) | (buffer[6]>>6);
 
@@ -136,21 +171,20 @@ int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec)
     mpeg2dec->copy_matrix = 3;
     if (buffer[7] & 2) {
 	for (i = 0; i < 64; i++)
-	    mpeg2dec->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
+	    mpeg2dec->new_quantizer_matrix[0][mpeg2_scan_norm[i]] =
 		(buffer[i+7] << 7) | (buffer[i+8] >> 1);
 	buffer += 64;
     } else
 	for (i = 0; i < 64; i++)
-	    mpeg2dec->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
+	    mpeg2dec->new_quantizer_matrix[0][mpeg2_scan_norm[i]] =
 		default_intra_quantizer_matrix[i];
 
     if (buffer[7] & 1)
 	for (i = 0; i < 64; i++)
-	    mpeg2dec->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] =
+	    mpeg2dec->new_quantizer_matrix[1][mpeg2_scan_norm[i]] =
 		buffer[i+8];
     else
-	for (i = 0; i < 64; i++)
-	    mpeg2dec->non_intra_quantizer_matrix[i] = 16;
+	memset (mpeg2dec->new_quantizer_matrix[1], 16, 64);
 
     sequence->profile_level_id = 0x80;
     sequence->colour_primaries = 0;
@@ -161,8 +195,6 @@ int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec)
     mpeg2dec->state = STATE_SEQUENCE;
     mpeg2dec->display_offset_x = mpeg2dec->display_offset_y = 0;
 
-    reset_info (&(mpeg2dec->info));
-    mpeg2dec->info.gop = NULL;
     return 0;
 }
 
@@ -218,12 +250,11 @@ static int sequence_display_ext (mpeg2dec_t * mpeg2dec)
 {
     uint8_t * buffer = mpeg2dec->chunk_start;
     mpeg2_sequence_t * sequence = &(mpeg2dec->new_sequence);
-    uint32_t flags;
 
-    flags = ((sequence->flags & ~SEQ_MASK_VIDEO_FORMAT) |
-	     ((buffer[0]<<4) & SEQ_MASK_VIDEO_FORMAT));
+    sequence->flags = ((sequence->flags & ~SEQ_MASK_VIDEO_FORMAT) |
+		       ((buffer[0]<<4) & SEQ_MASK_VIDEO_FORMAT));
     if (buffer[0] & 1) {
-	flags |= SEQ_FLAG_COLOUR_DESCRIPTION;
+	sequence->flags |= SEQ_FLAG_COLOUR_DESCRIPTION;
 	sequence->colour_primaries = buffer[1];
 	sequence->transfer_characteristics = buffer[2];
 	sequence->matrix_coefficients = buffer[3];
@@ -240,6 +271,17 @@ static int sequence_display_ext (mpeg2dec_t * mpeg2dec)
     return 0;
 }
 
+static inline void simplify (unsigned int * u, unsigned int * v)
+{
+    unsigned int a, b, tmp;
+
+    a = *u;	b = *v;
+    while (a) {	/* find greatest common divisor */
+	tmp = a;	a = b % tmp;	b = tmp;
+    }
+    *u /= b;	*v /= b;
+}
+
 static inline void finalize_sequence (mpeg2_sequence_t * sequence)
 {
     int width;
@@ -276,8 +318,10 @@ static inline void finalize_sequence (mpeg2_sequence_t * sequence)
 	    sequence->pixel_width = 64;	sequence->pixel_height = 45;	return;
 	case 6:	/* 720x480 16:9 */
 	    sequence->pixel_width = 32;	sequence->pixel_height = 27;	return;
-	case 12:	/* 720*480 4:3 */
-	    sequence->pixel_width = 8;	sequence->pixel_height = 9;	return;
+	case 8: /* BT.601 625 lines 4:3 */
+	    sequence->pixel_width = 59;	sequence->pixel_height = 54;	return;
+	case 12: /* BT.601 525 lines 4:3 */
+	    sequence->pixel_width = 10;	sequence->pixel_height = 11;	return;
 	default:
 	    height = 88 * sequence->pixel_width + 1171;
 	    width = 2000;
@@ -286,28 +330,120 @@ static inline void finalize_sequence (mpeg2_sequence_t * sequence)
 
     sequence->pixel_width = width;
     sequence->pixel_height = height;
-    while (width) {	/* find greatest common divisor */
-	int tmp = width;
-	width = height % tmp;
-	height = tmp;
+    simplify (&sequence->pixel_width, &sequence->pixel_height);
+}
+
+int mpeg2_guess_aspect (const mpeg2_sequence_t * sequence,
+			unsigned int * pixel_width,
+			unsigned int * pixel_height)
+{
+    static struct {
+	unsigned int width, height;
+    } video_modes[] = {
+	{720, 576}, /* 625 lines, 13.5 MHz (D1, DV, DVB, DVD) */
+	{704, 576}, /* 625 lines, 13.5 MHz (1/1 D1, DVB, DVD, 4CIF) */
+	{544, 576}, /* 625 lines, 10.125 MHz (DVB, laserdisc) */
+	{528, 576}, /* 625 lines, 10.125 MHz (3/4 D1, DVB, laserdisc) */
+	{480, 576}, /* 625 lines, 9 MHz (2/3 D1, DVB, SVCD) */
+	{352, 576}, /* 625 lines, 6.75 MHz (D2, 1/2 D1, CVD, DVB, DVD) */
+	{352, 288}, /* 625 lines, 6.75 MHz, 1 field (D4, VCD, DVB, DVD, CIF) */
+	{176, 144}, /* 625 lines, 3.375 MHz, half field (QCIF) */
+	{720, 486}, /* 525 lines, 13.5 MHz (D1) */
+	{704, 486}, /* 525 lines, 13.5 MHz */
+	{720, 480}, /* 525 lines, 13.5 MHz (DV, DSS, DVD) */
+	{704, 480}, /* 525 lines, 13.5 MHz (1/1 D1, ATSC, DVD) */
+	{544, 480}, /* 525 lines, 10.125 MHz (DSS, laserdisc) */
+	{528, 480}, /* 525 lines, 10.125 MHz (3/4 D1, laserdisc) */
+	{480, 480}, /* 525 lines, 9 MHz (2/3 D1, SVCD) */
+	{352, 480}, /* 525 lines, 6.75 MHz (D2, 1/2 D1, CVD, DVD) */
+	{352, 240}  /* 525 lines, 6.75 MHz, 1 field (D4, VCD, DSS, DVD) */
+    };
+    unsigned int width, height, pix_width, pix_height, i, DAR_16_9;
+
+    *pixel_width = sequence->pixel_width;
+    *pixel_height = sequence->pixel_height;
+    width = sequence->picture_width;
+    height = sequence->picture_height;
+    for (i = 0; i < sizeof (video_modes) / sizeof (video_modes[0]); i++)
+	if (width == video_modes[i].width && height == video_modes[i].height)
+	    break;
+    if (i == sizeof (video_modes) / sizeof (video_modes[0]) ||
+	(sequence->pixel_width == 1 && sequence->pixel_height == 1) ||
+	width != sequence->display_width || height != sequence->display_height)
+	return 0;
+
+    for (pix_height = 1; height * pix_height < 480; pix_height <<= 1);
+    height *= pix_height;
+    for (pix_width = 1; width * pix_width <= 352; pix_width <<= 1);
+    width *= pix_width;
+
+    if (! (sequence->flags & SEQ_FLAG_MPEG2)) {
+	static unsigned int mpeg1_check[2][2] = {{11, 54}, {27, 45}};
+	DAR_16_9 = (sequence->pixel_height == 27 ||
+		    sequence->pixel_height == 45);
+	if (width < 704 ||
+	    sequence->pixel_height != mpeg1_check[DAR_16_9][height == 576])
+	    return 0;
+    } else {
+	DAR_16_9 = (3 * sequence->picture_width * sequence->pixel_width >
+		    4 * sequence->picture_height * sequence->pixel_height);
+	switch (width) {
+	case 528: case 544:	pix_width *= 4; pix_height *= 3; break;
+	case 480:		pix_width *= 3; pix_height *= 2; break;
+	}
+    }
+    if (DAR_16_9) {
+	pix_width *= 4; pix_height *= 3;
     }
-    sequence->pixel_width /= height;
-    sequence->pixel_height /= height;
+    if (height == 576) {
+	pix_width *= 59; pix_height *= 54;
+    } else {
+	pix_width *= 10; pix_height *= 11;
+    }
+    *pixel_width = pix_width;
+    *pixel_height = pix_height;
+    simplify (pixel_width, pixel_height);
+    return (height == 576) ? 1 : 2;
 }
 
-void mpeg2_header_matrix_finalize (mpeg2dec_t * mpeg2dec)
+static void copy_matrix (mpeg2dec_t * mpeg2dec, int index)
+{
+    if (memcmp (mpeg2dec->quantizer_matrix[index],
+		mpeg2dec->new_quantizer_matrix[index], 64)) {
+	memcpy (mpeg2dec->quantizer_matrix[index],
+		mpeg2dec->new_quantizer_matrix[index], 64);
+	mpeg2dec->scaled[index] = -1;
+    }
+}
+
+static void finalize_matrix (mpeg2dec_t * mpeg2dec)
 {
     mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
     int i;
 
-    if (mpeg2dec->copy_matrix & 1)
-	for (i = 0; i < 64; i++)
-	    decoder->intra_quantizer_matrix[i] =
-		mpeg2dec->intra_quantizer_matrix[i];
-    if (mpeg2dec->copy_matrix & 2)
-	for (i = 0; i < 64; i++)
-	    decoder->non_intra_quantizer_matrix[i] =
-		mpeg2dec->non_intra_quantizer_matrix[i];
+    for (i = 0; i < 2; i++) {
+	if (mpeg2dec->copy_matrix & (1 << i))
+	    copy_matrix (mpeg2dec, i);
+	if ((mpeg2dec->copy_matrix & (4 << i)) &&
+	    memcmp (mpeg2dec->quantizer_matrix[i],
+		    mpeg2dec->new_quantizer_matrix[i+2], 64)) {
+	    copy_matrix (mpeg2dec, i + 2);
+	    decoder->chroma_quantizer[i] = decoder->quantizer_prescale[i+2];
+	} else if (mpeg2dec->copy_matrix & (5 << i))
+	    decoder->chroma_quantizer[i] = decoder->quantizer_prescale[i];
+    }
+}
+
+static mpeg2_state_t invalid_end_action (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2_reset_info (&(mpeg2dec->info));
+    mpeg2dec->info.gop = NULL;
+    info_user_data (mpeg2dec);
+    mpeg2_header_state_init (mpeg2dec);
+    mpeg2dec->sequence = mpeg2dec->new_sequence;
+    mpeg2dec->action = mpeg2_seek_header;
+    mpeg2dec->state = STATE_SEQUENCE;
+    return STATE_SEQUENCE;
 }
 
 void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec)
@@ -316,37 +452,56 @@ void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec)
     mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
 
     finalize_sequence (sequence);
+    finalize_matrix (mpeg2dec);
 
-    mpeg2_header_matrix_finalize (mpeg2dec);
     decoder->mpeg1 = !(sequence->flags & SEQ_FLAG_MPEG2);
     decoder->width = sequence->width;
     decoder->height = sequence->height;
     decoder->vertical_position_extension = (sequence->picture_height > 2800);
-
-    /*
-     * according to 6.1.1.6, repeat sequence headers should be
-     * identical to the original. However some DVDs dont respect that
-     * and have different bitrates in the repeat sequence headers. So
-     * we'll ignore that in the comparison and still consider these as
-     * repeat sequence headers.
-     */
-    mpeg2dec->sequence.byte_rate = sequence->byte_rate;
-    if (!memcmp (&(mpeg2dec->sequence), sequence, sizeof (mpeg2_sequence_t)))
-	mpeg2dec->state = STATE_SEQUENCE_REPEATED;
+    decoder->chroma_format = ((sequence->chroma_width == sequence->width) +
+			      (sequence->chroma_height == sequence->height));
+
+    if (mpeg2dec->sequence.width != (unsigned)-1) {
+	/*
+	 * According to 6.1.1.6, repeat sequence headers should be
+	 * identical to the original. However some encoders don't
+	 * respect that and change various fields (including bitrate
+	 * and aspect ratio) in the repeat sequence headers. So we
+	 * choose to be as conservative as possible and only restart
+	 * the decoder if the width, height, chroma_width,
+	 * chroma_height or low_delay flag are modified.
+	 */
+	if (sequence->width != mpeg2dec->sequence.width ||
+	    sequence->height != mpeg2dec->sequence.height ||
+	    sequence->chroma_width != mpeg2dec->sequence.chroma_width ||
+	    sequence->chroma_height != mpeg2dec->sequence.chroma_height ||
+	    ((sequence->flags ^ mpeg2dec->sequence.flags) &
+	     SEQ_FLAG_LOW_DELAY)) {
+	    decoder->stride_frame = sequence->width;
+	    mpeg2_header_end (mpeg2dec);
+	    mpeg2dec->action = invalid_end_action;
+	    mpeg2dec->state = STATE_INVALID_END;
+	    return;
+	}
+	mpeg2dec->state = (memcmp (&(mpeg2dec->sequence), sequence,
+				   sizeof (mpeg2_sequence_t)) ?
+			   STATE_SEQUENCE_MODIFIED : STATE_SEQUENCE_REPEATED);
+    } else
+	decoder->stride_frame = sequence->width;
     mpeg2dec->sequence = *sequence;
-
+    mpeg2_reset_info (&(mpeg2dec->info));
     mpeg2dec->info.sequence = &(mpeg2dec->sequence);
+    mpeg2dec->info.gop = NULL;
+    info_user_data (mpeg2dec);
 }
 
 int mpeg2_header_gop (mpeg2dec_t * mpeg2dec)
 {
     uint8_t * buffer = mpeg2dec->chunk_start;
-    mpeg2_gop_t * gop = &(mpeg2dec->gop);
+    mpeg2_gop_t * gop = &(mpeg2dec->new_gop);
 
-    reset_info (&(mpeg2dec->info));
     if (! (buffer[1] & 8))
 	return 1;
-    mpeg2dec->info.gop = gop;
     gop->hours = (buffer[0] >> 2) & 31;
     gop->minutes = ((buffer[0] << 4) | (buffer[1] >> 4)) & 63;
     gop->seconds = ((buffer[1] << 3) | (buffer[2] >> 5)) & 63;
@@ -356,7 +511,15 @@ int mpeg2_header_gop (mpeg2dec_t * mpeg2dec)
     return 0;
 }
 
-void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int coding_type)
+void mpeg2_header_gop_finalize (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2dec->gop = mpeg2dec->new_gop;
+    mpeg2_reset_info (&(mpeg2dec->info));
+    mpeg2dec->info.gop = &(mpeg2dec->gop);
+    info_user_data (mpeg2dec);
+}
+
+void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int b_type)
 {
     int i;
 
@@ -364,139 +527,30 @@ void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int coding_type)
 	if (mpeg2dec->fbuf[1] != &mpeg2dec->fbuf_alloc[i].fbuf &&
 	    mpeg2dec->fbuf[2] != &mpeg2dec->fbuf_alloc[i].fbuf) {
 	    mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[i].fbuf;
-	    if (!mpeg2dec->custom_fbuf) {
-	      mpeg2dec->info.current_fbuf = mpeg2dec->fbuf[0];
-	      if ((coding_type == B_TYPE) ||
-		  (mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) {
-		  mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
-		  if ((coding_type == B_TYPE) || (mpeg2dec->convert_start)) {
-		      mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
-                      mpeg2dec->fbuf[0]=0;
-                  }
-	      }
-            }
+	    mpeg2dec->info.current_fbuf = mpeg2dec->fbuf[0];
+	    if (b_type || (mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) {
+		if (b_type || mpeg2dec->convert)
+		    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
+		mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
+	    }
 	    break;
 	}
 }
 
-mpeg2_state_t mpeg2_header_picture_start (mpeg2dec_t * mpeg2dec)
-{
-    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
-    mpeg2_picture_t * picture;
-
-    if (mpeg2dec->state != STATE_SLICE_1ST) {
-	mpeg2dec->state = STATE_PICTURE;
-	picture = mpeg2dec->pictures;
-	if ((decoder->coding_type != PIC_FLAG_CODING_TYPE_B) ^
-	    (mpeg2dec->picture >= mpeg2dec->pictures + 2))
-	    picture += 2;
-    } else {
-	mpeg2dec->state = STATE_PICTURE_2ND;
-	picture = mpeg2dec->picture + 1;	/* second field picture */
-    }
-    mpeg2dec->picture = picture;
-    picture->flags = 0;
-    if (mpeg2dec->num_pts) {
-	if (mpeg2dec->bytes_since_pts >= 4) {
-	    mpeg2dec->num_pts = 0;
-	    picture->pts = mpeg2dec->pts_current;
-	    picture->flags = PIC_FLAG_PTS;
-	} else if (mpeg2dec->num_pts > 1) {
-	    mpeg2dec->num_pts = 1;
-	    picture->pts = mpeg2dec->pts_previous;
-	    picture->flags = PIC_FLAG_PTS;
-	}
-    }
-    picture->display_offset[0].x = picture->display_offset[1].x =
-	picture->display_offset[2].x = mpeg2dec->display_offset_x;
-    picture->display_offset[0].y = picture->display_offset[1].y =
-	picture->display_offset[2].y = mpeg2dec->display_offset_y;
-    return mpeg2_parse_header (mpeg2dec);
-}
-
 int mpeg2_header_picture (mpeg2dec_t * mpeg2dec)
 {
     uint8_t * buffer = mpeg2dec->chunk_start;
-    mpeg2_picture_t * picture = mpeg2dec->picture;
+    mpeg2_picture_t * picture = &(mpeg2dec->new_picture);
     mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
     int type;
-    int low_delay;
-
-    type = (buffer [1] >> 3) & 7;
-    low_delay = mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY;
-
-    if (mpeg2dec->state == STATE_PICTURE) {
-	mpeg2_picture_t * other;
-
-	decoder->second_field = 0;
-	other = mpeg2dec->pictures;
-	if (other == picture)
-	    other += 2;
-	if (decoder->coding_type != PIC_FLAG_CODING_TYPE_B) {
-	    mpeg2dec->fbuf[2] = mpeg2dec->fbuf[1];
-	    mpeg2dec->fbuf[1] = mpeg2dec->fbuf[0];
-	}
-	mpeg2dec->fbuf[0] = NULL;
-	reset_info (&(mpeg2dec->info));
-	mpeg2dec->info.current_picture = picture;
-	mpeg2dec->info.display_picture = picture;
-	if (type != PIC_FLAG_CODING_TYPE_B) {
-	    if (!low_delay) {
-		if (mpeg2dec->first) {
-		    mpeg2dec->info.display_picture = NULL;
-		    mpeg2dec->first = 0;
-		} else {
-		    mpeg2dec->info.display_picture = other;
-		    if (other->nb_fields == 1)
-			mpeg2dec->info.display_picture_2nd = other + 1;
-		    mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
-		}
-	    }
-	    if (!low_delay + !mpeg2dec->convert_start) {
-		mpeg2dec->info.discard_fbuf =
-		    mpeg2dec->fbuf[!low_delay + !mpeg2dec->convert_start];
-                // FIXME: Might want to wipe this whole section, once pictures is sorted. 
-                // mpeg2dec->fbuf[!low_delay + !mpeg2dec->convert_start]=0;
-            }
-	}
-	if (!mpeg2dec->custom_fbuf) {
-	    while (mpeg2dec->alloc_index < 3) {
-		mpeg2_fbuf_t * fbuf;
 
-		fbuf = &(mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index++].fbuf);
-		fbuf->id = NULL;
-		if (mpeg2dec->convert_start) {    
-		    fbuf->buf[0] =
-			(uint8_t *) mpeg2_malloc (mpeg2dec->convert_size[0],
-						  ALLOC_CONVERTED);
-		    fbuf->buf[1] = fbuf->buf[0] + mpeg2dec->convert_size[1];
-		    fbuf->buf[2] = fbuf->buf[0] + mpeg2dec->convert_size[2];
-		} else {
-		    int size;
-		    size = mpeg2dec->decoder.width * mpeg2dec->decoder.height;
-		    fbuf->buf[0] = (uint8_t *) mpeg2_malloc (6 * size >> 2,
-							     ALLOC_YUV);
-		    fbuf->buf[1] = fbuf->buf[0] + size;
-		    fbuf->buf[2] = fbuf->buf[1] + (size >> 2);
-		}
-	    }
-            abort();
-	    mpeg2_set_fbuf (mpeg2dec, type);
-	}
-    } else {
-	decoder->second_field = 1;
-	mpeg2dec->info.current_picture_2nd = picture;
-	mpeg2dec->info.user_data = NULL; mpeg2dec->info.user_data_len = 0;
-	if (low_delay || type == PIC_FLAG_CODING_TYPE_B)
-	    mpeg2dec->info.display_picture_2nd = picture;
-    }
+    mpeg2dec->state = ((mpeg2dec->state != STATE_SLICE_1ST) ?
+		       STATE_PICTURE : STATE_PICTURE_2ND);
     mpeg2dec->ext_state = PIC_CODING_EXT;
 
     picture->temporal_reference = (buffer[0] << 2) | (buffer[1] >> 6);
 
-    decoder->coding_type = type;
-    picture->flags |= type;
-
+    type = (buffer [1] >> 3) & 7;
     if (type == PIC_FLAG_CODING_TYPE_P || type == PIC_FLAG_CODING_TYPE_B) {
 	/* forward_f_code and backward_f_code - used in mpeg1 only */
 	decoder->f_motion.f_code[1] = (buffer[3] >> 2) & 1;
@@ -506,13 +560,32 @@ int mpeg2_header_picture (mpeg2dec_t * mpeg2dec)
 	decoder->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1;
     }
 
-    /* XXXXXX decode extra_information_picture as well */
-
+    picture->flags = PIC_FLAG_PROGRESSIVE_FRAME | type;
+    picture->tag = picture->tag2 = 0;
+    if (mpeg2dec->num_tags) {
+	if (mpeg2dec->bytes_since_tag >= mpeg2dec->chunk_ptr - buffer + 4) {
+	    mpeg2dec->num_tags = 0;
+	    picture->tag = mpeg2dec->tag_current;
+	    picture->tag2 = mpeg2dec->tag2_current;
+	    picture->flags |= PIC_FLAG_TAGS;
+	} else if (mpeg2dec->num_tags > 1) {
+	    mpeg2dec->num_tags = 1;
+	    picture->tag = mpeg2dec->tag_previous;
+	    picture->tag2 = mpeg2dec->tag2_previous;
+	    picture->flags |= PIC_FLAG_TAGS;
+	}
+    }
     picture->nb_fields = 2;
+    picture->display_offset[0].x = picture->display_offset[1].x =
+	picture->display_offset[2].x = mpeg2dec->display_offset_x;
+    picture->display_offset[0].y = picture->display_offset[1].y =
+	picture->display_offset[2].y = mpeg2dec->display_offset_y;
 
-    decoder->intra_dc_precision = 0;
+    /* XXXXXX decode extra_information_picture as well */
+
+    mpeg2dec->q_scale_type = 0;
+    decoder->intra_dc_precision = 7;
     decoder->frame_pred_frame_dct = 1;
-    decoder->q_scale_type = 0;
     decoder->concealment_motion_vectors = 0;
     decoder->scan = mpeg2_scan_norm;
     decoder->picture_structure = FRAME_PICTURE;
@@ -524,7 +597,7 @@ int mpeg2_header_picture (mpeg2dec_t * mpeg2dec)
 static int picture_coding_ext (mpeg2dec_t * mpeg2dec)
 {
     uint8_t * buffer = mpeg2dec->chunk_start;
-    mpeg2_picture_t * picture = mpeg2dec->picture;
+    mpeg2_picture_t * picture = &(mpeg2dec->new_picture);
     mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
     uint32_t flags;
 
@@ -535,7 +608,7 @@ static int picture_coding_ext (mpeg2dec_t * mpeg2dec)
     decoder->b_motion.f_code[1] = (buffer[2] >> 4) - 1;
 
     flags = picture->flags;
-    decoder->intra_dc_precision = (buffer[2] >> 2) & 3;
+    decoder->intra_dc_precision = 7 - ((buffer[2] >> 2) & 3);
     decoder->picture_structure = buffer[2] & 3;
     switch (decoder->picture_structure) {
     case TOP_FIELD:
@@ -544,7 +617,6 @@ static int picture_coding_ext (mpeg2dec_t * mpeg2dec)
 	picture->nb_fields = 1;
 	break;
     case FRAME_PICTURE:
-        /* buffer[3] & 2 is repeat first field */
 	if (!(mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE)) {
 	    picture->nb_fields = (buffer[3] & 2) ? 3 : 2;
 	    flags |= (buffer[3] & 128) ? PIC_FLAG_TOP_FIELD_FIRST : 0;
@@ -557,10 +629,11 @@ static int picture_coding_ext (mpeg2dec_t * mpeg2dec)
     decoder->top_field_first = buffer[3] >> 7;
     decoder->frame_pred_frame_dct = (buffer[3] >> 6) & 1;
     decoder->concealment_motion_vectors = (buffer[3] >> 5) & 1;
-    decoder->q_scale_type = (buffer[3] >> 4) & 1;
+    mpeg2dec->q_scale_type = buffer[3] & 16;
     decoder->intra_vlc_format = (buffer[3] >> 3) & 1;
     decoder->scan = (buffer[3] & 4) ? mpeg2_scan_alt : mpeg2_scan_norm;
-    flags |= (buffer[4] & 0x80) ? PIC_FLAG_PROGRESSIVE_FRAME : 0;
+    if (!(buffer[4] & 0x80))
+	flags &= ~PIC_FLAG_PROGRESSIVE_FRAME;
     if (buffer[4] & 0x40)
 	flags |= (((buffer[4]<<26) | (buffer[5]<<18) | (buffer[6]<<10)) &
 		  PIC_MASK_COMPOSITE_DISPLAY) | PIC_FLAG_COMPOSITE_DISPLAY;
@@ -574,7 +647,7 @@ static int picture_coding_ext (mpeg2dec_t * mpeg2dec)
 static int picture_display_ext (mpeg2dec_t * mpeg2dec)
 {
     uint8_t * buffer = mpeg2dec->chunk_start;
-    mpeg2_picture_t * picture = mpeg2dec->picture;
+    mpeg2_picture_t * picture = &(mpeg2dec->new_picture);
     int i, nb_pos;
 
     nb_pos = picture->nb_fields;
@@ -600,6 +673,140 @@ static int picture_display_ext (mpeg2dec_t * mpeg2dec)
     return 0;
 }
 
+void mpeg2_header_picture_finalize (mpeg2dec_t * mpeg2dec, uint32_t accels)
+{
+    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
+    int old_type_b = (decoder->coding_type == B_TYPE);
+    int low_delay = mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY;
+
+    finalize_matrix (mpeg2dec);
+    decoder->coding_type = mpeg2dec->new_picture.flags & PIC_MASK_CODING_TYPE;
+
+    if (mpeg2dec->state == STATE_PICTURE) {
+	mpeg2_picture_t * picture;
+	mpeg2_picture_t * other;
+
+	decoder->second_field = 0;
+
+	picture = other = mpeg2dec->pictures;
+	if (old_type_b ^ (mpeg2dec->picture < mpeg2dec->pictures + 2))
+	    picture += 2;
+	else
+	    other += 2;
+	mpeg2dec->picture = picture;
+	*picture = mpeg2dec->new_picture;
+
+	if (!old_type_b) {
+	    mpeg2dec->fbuf[2] = mpeg2dec->fbuf[1];
+	    mpeg2dec->fbuf[1] = mpeg2dec->fbuf[0];
+	}
+	mpeg2dec->fbuf[0] = NULL;
+	mpeg2_reset_info (&(mpeg2dec->info));
+	mpeg2dec->info.current_picture = picture;
+	mpeg2dec->info.display_picture = picture;
+	if (decoder->coding_type != B_TYPE) {
+	    if (!low_delay) {
+		if (mpeg2dec->first) {
+		    mpeg2dec->info.display_picture = NULL;
+		    mpeg2dec->first = 0;
+		} else {
+		    mpeg2dec->info.display_picture = other;
+		    if (other->nb_fields == 1)
+			mpeg2dec->info.display_picture_2nd = other + 1;
+		    mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
+		}
+	    }
+	    if (!low_delay + !mpeg2dec->convert)
+		mpeg2dec->info.discard_fbuf =
+		    mpeg2dec->fbuf[!low_delay + !mpeg2dec->convert];
+	}
+	if (mpeg2dec->convert) {
+	    mpeg2_convert_init_t convert_init;
+	    if (!mpeg2dec->convert_start) {
+		int y_size, uv_size;
+
+		mpeg2dec->decoder.convert_id =
+		    mpeg2_malloc (mpeg2dec->convert_id_size,
+				  MPEG2_ALLOC_CONVERT_ID);
+		mpeg2dec->convert (MPEG2_CONVERT_START,
+				   mpeg2dec->decoder.convert_id,
+				   &(mpeg2dec->sequence),
+				   mpeg2dec->convert_stride, accels,
+				   mpeg2dec->convert_arg, &convert_init);
+		mpeg2dec->convert_start = convert_init.start;
+		mpeg2dec->decoder.convert = convert_init.copy;
+
+		y_size = decoder->stride_frame * mpeg2dec->sequence.height;
+		uv_size = y_size >> (2 - mpeg2dec->decoder.chroma_format);
+		mpeg2dec->yuv_buf[0][0] =
+		    (uint8_t *) mpeg2_malloc (y_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[0][1] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[0][2] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[1][0] =
+		    (uint8_t *) mpeg2_malloc (y_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[1][1] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[1][2] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		y_size = decoder->stride_frame * 32;
+		uv_size = y_size >> (2 - mpeg2dec->decoder.chroma_format);
+		mpeg2dec->yuv_buf[2][0] =
+		    (uint8_t *) mpeg2_malloc (y_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[2][1] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[2][2] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+	    }
+	    if (!mpeg2dec->custom_fbuf) {
+		while (mpeg2dec->alloc_index < 3) {
+		    mpeg2_fbuf_t * fbuf;
+
+		    fbuf = &mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index++].fbuf;
+		    fbuf->id = NULL;
+		    fbuf->buf[0] =
+			(uint8_t *) mpeg2_malloc (convert_init.buf_size[0],
+						  MPEG2_ALLOC_CONVERTED);
+		    fbuf->buf[1] =
+			(uint8_t *) mpeg2_malloc (convert_init.buf_size[1],
+						  MPEG2_ALLOC_CONVERTED);
+		    fbuf->buf[2] =
+			(uint8_t *) mpeg2_malloc (convert_init.buf_size[2],
+						  MPEG2_ALLOC_CONVERTED);
+		}
+		mpeg2_set_fbuf (mpeg2dec, (decoder->coding_type == B_TYPE));
+	    }
+	} else if (!mpeg2dec->custom_fbuf) {
+	    while (mpeg2dec->alloc_index < 3) {
+		mpeg2_fbuf_t * fbuf;
+		int y_size, uv_size;
+
+		fbuf = &(mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index++].fbuf);
+		fbuf->id = NULL;
+		y_size = decoder->stride_frame * mpeg2dec->sequence.height;
+		uv_size = y_size >> (2 - decoder->chroma_format);
+		fbuf->buf[0] = (uint8_t *) mpeg2_malloc (y_size,
+							 MPEG2_ALLOC_YUV);
+		fbuf->buf[1] = (uint8_t *) mpeg2_malloc (uv_size,
+							 MPEG2_ALLOC_YUV);
+		fbuf->buf[2] = (uint8_t *) mpeg2_malloc (uv_size,
+							 MPEG2_ALLOC_YUV);
+	    }
+	    mpeg2_set_fbuf (mpeg2dec, (decoder->coding_type == B_TYPE));
+	}
+    } else {
+	decoder->second_field = 1;
+	mpeg2dec->picture++;	/* second field picture */
+	*(mpeg2dec->picture) = mpeg2dec->new_picture;
+	mpeg2dec->info.current_picture_2nd = mpeg2dec->picture;
+	if (low_delay || decoder->coding_type == B_TYPE)
+	    mpeg2dec->info.display_picture_2nd = mpeg2dec->picture;
+    }
+
+    info_user_data (mpeg2dec);
+}
+
 static int copyright_ext (mpeg2dec_t * mpeg2dec)
 {
     return 0;
@@ -608,22 +815,16 @@ static int copyright_ext (mpeg2dec_t * mpeg2dec)
 static int quant_matrix_ext (mpeg2dec_t * mpeg2dec)
 {
     uint8_t * buffer = mpeg2dec->chunk_start;
-    int i;
-
-    if (buffer[0] & 8) {
-	for (i = 0; i < 64; i++)
-	    mpeg2dec->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
-		(buffer[i] << 5) | (buffer[i+1] >> 3);
-	mpeg2dec->copy_matrix |= 1;
-	buffer += 64;
-    }
-
-    if (buffer[0] & 4) {
-	for (i = 0; i < 64; i++)
-	    mpeg2dec->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] =
-		(buffer[i] << 6) | (buffer[i+1] >> 2);
-	mpeg2dec->copy_matrix |= 2;
-    }
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+	if (buffer[0] & (8 >> i)) {
+	    for (j = 0; j < 64; j++)
+		mpeg2dec->new_quantizer_matrix[i][mpeg2_scan_norm[j]] =
+		    (buffer[j] << (i+5)) | (buffer[j+1] >> (3-i));
+	    mpeg2dec->copy_matrix |= 1 << i;
+	    buffer += 64;
+	}
 
     return 0;
 }
@@ -647,42 +848,59 @@ int mpeg2_header_extension (mpeg2dec_t * mpeg2dec)
 
 int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec)
 {
-    if (!mpeg2dec->info.user_data_len)
-	mpeg2dec->info.user_data = mpeg2dec->chunk_start;
-    else
-	mpeg2dec->info.user_data_len += 3;
-    mpeg2dec->info.user_data_len += (mpeg2dec->chunk_ptr - 4 -
-				     mpeg2dec->chunk_start);
+    mpeg2dec->user_data_len += mpeg2dec->chunk_ptr - 1 - mpeg2dec->chunk_start;
     mpeg2dec->chunk_start = mpeg2dec->chunk_ptr - 1;
     
     return 0;
 }
 
+static void prescale (mpeg2dec_t * mpeg2dec, int index)
+{
+    static int non_linear_scale [] = {
+	 0,  1,  2,  3,  4,  5,   6,   7,
+	 8, 10, 12, 14, 16, 18,  20,  22,
+	24, 28, 32, 36, 40, 44,  48,  52,
+	56, 64, 72, 80, 88, 96, 104, 112
+    };
+    int i, j, k;
+    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
+
+    if (mpeg2dec->scaled[index] != mpeg2dec->q_scale_type) {
+	mpeg2dec->scaled[index] = mpeg2dec->q_scale_type;
+	for (i = 0; i < 32; i++) {
+	    k = mpeg2dec->q_scale_type ? non_linear_scale[i] : (i << 1);
+	    for (j = 0; j < 64; j++)
+		decoder->quantizer_prescale[index][i][j] =
+		    k * mpeg2dec->quantizer_matrix[index][j];
+	}
+    }
+}
+
 mpeg2_state_t mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec)
 {
-    mpeg2dec->info.user_data = NULL; mpeg2dec->info.user_data_len = 0;
+    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
+
+    mpeg2dec->info.user_data = NULL;	mpeg2dec->info.user_data_len = 0;
     mpeg2dec->state = ((mpeg2dec->picture->nb_fields > 1 ||
 			mpeg2dec->state == STATE_PICTURE_2ND) ?
 		       STATE_SLICE : STATE_SLICE_1ST);
 
+    if (mpeg2dec->decoder.coding_type != D_TYPE) {
+	prescale (mpeg2dec, 0);
+	if (decoder->chroma_quantizer[0] == decoder->quantizer_prescale[2])
+	    prescale (mpeg2dec, 2);
+	if (mpeg2dec->decoder.coding_type != I_TYPE) {
+	    prescale (mpeg2dec, 1);
+	    if (decoder->chroma_quantizer[1] == decoder->quantizer_prescale[3])
+		prescale (mpeg2dec, 3);
+	}
+    }
+
     if (!(mpeg2dec->nb_decode_slices))
 	mpeg2dec->picture->flags |= PIC_FLAG_SKIP;
     else if (mpeg2dec->convert_start) {
-	int flags;
-
-	switch (mpeg2dec->decoder.picture_structure) {
-	case TOP_FIELD:		flags = CONVERT_TOP_FIELD;	break;
-	case BOTTOM_FIELD:	flags = CONVERT_BOTTOM_FIELD;	break;
-	default:
-	    flags =
-		((mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE) ?
-		 CONVERT_FRAME : CONVERT_BOTH_FIELDS);
-	}
-	mpeg2dec->convert_start (mpeg2dec->convert_id,
-				 mpeg2dec->fbuf[0]->buf, flags);
-
-	mpeg2dec->decoder.convert = mpeg2dec->convert_copy;
-	mpeg2dec->decoder.fbuf_id = mpeg2dec->convert_id;
+	mpeg2dec->convert_start (decoder->convert_id, mpeg2dec->fbuf[0],
+				 mpeg2dec->picture, mpeg2dec->info.gop);
 
 	if (mpeg2dec->decoder.coding_type == B_TYPE)
 	    mpeg2_init_fbuf (&(mpeg2dec->decoder), mpeg2dec->yuv_buf[2],
@@ -699,69 +917,23 @@ mpeg2_state_t mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec)
     } else {
 	int b_type;
 
-	mpeg2dec->decoder.convert = NULL;
 	b_type = (mpeg2dec->decoder.coding_type == B_TYPE);
 	mpeg2_init_fbuf (&(mpeg2dec->decoder), mpeg2dec->fbuf[0]->buf,
 			 mpeg2dec->fbuf[b_type + 1]->buf,
 			 mpeg2dec->fbuf[b_type]->buf);
     }
     mpeg2dec->action = NULL;
-    return (mpeg2_state_t)-1;
-}
-
-mpeg2_state_t mpeg2_header_end_btype2 (mpeg2dec_t * mpeg2dec)
-{
-	mpeg2dec->info.display_fbuf = 0;
-	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[1];
-        mpeg2dec->fbuf[1]=0;
-        mpeg2dec->action = mpeg2_seek_sequence;
-        mpeg2dec->first = 1;
-        return STATE_END;
-}
-mpeg2_state_t mpeg2_header_end_btype (mpeg2dec_t * mpeg2dec)
-{
-	mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
-	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[2];
-        mpeg2dec->fbuf[2]=0;
-        mpeg2dec->action = mpeg2_header_end_btype2;
-        return STATE_SLICE;
+    return STATE_INTERNAL_NORETURN;
 }
 
-mpeg2_state_t mpeg2_reset (mpeg2dec_t * mpeg2dec)
+static mpeg2_state_t seek_sequence (mpeg2dec_t * mpeg2dec)
 {
-	mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
-	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
-        mpeg2dec->fbuf[0]=0;
-        mpeg2dec->action = mpeg2_header_end_btype;
-        mpeg2dec->shift = 0xffffff00;
-        mpeg2dec->code = 0xb4;
-        mpeg2dec->first_decode_slice = 1;
-        mpeg2dec->nb_decode_slices = 0xb0 - 1;
-        mpeg2dec->decoder.scan = mpeg2_scan_norm;
-        mpeg2dec->picture = mpeg2dec->pictures;
-        mpeg2dec->first = 1;
-        mpeg2dec->alloc_index = 0;
-        mpeg2dec->alloc_index_user = 0;
-
-        return STATE_SLICE;
-}
-
-mpeg2_state_t mpeg2_header_end_itype2 (mpeg2dec_t * mpeg2dec)
-{
-	mpeg2dec->info.display_fbuf = 0;
-	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
-        mpeg2dec->fbuf[0]=0;
-        mpeg2dec->action = mpeg2_seek_sequence;
-        mpeg2dec->first = 1;
-        return STATE_END;
-}
-mpeg2_state_t mpeg2_header_end_itype (mpeg2dec_t * mpeg2dec)
-{
-	mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
-	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[1];
-        mpeg2dec->fbuf[1]=0;
-        mpeg2dec->action = mpeg2_header_end_itype2;
-        return STATE_SLICE;
+    mpeg2_reset_info (&(mpeg2dec->info));
+    mpeg2dec->info.sequence = NULL;
+    mpeg2dec->info.gop = NULL;
+    mpeg2_header_state_init (mpeg2dec);
+    mpeg2dec->action = mpeg2_seek_header;
+    return mpeg2_seek_header (mpeg2dec);
 }
 
 mpeg2_state_t mpeg2_header_end (mpeg2dec_t * mpeg2dec)
@@ -774,54 +946,16 @@ mpeg2_state_t mpeg2_header_end (mpeg2dec_t * mpeg2dec)
     if ((mpeg2dec->picture >= picture + 2) ^ b_type)
 	picture = mpeg2dec->pictures + 2;
 
-    mpeg2dec->state = STATE_END;
-    reset_info (&(mpeg2dec->info));
-    if (b_type) {
-	mpeg2dec->info.display_picture = picture;
-	if (picture->nb_fields == 1)
-	    mpeg2dec->info.display_picture_2nd = picture + 1;
-	mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
-	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
-        mpeg2dec->fbuf[0]=0;
-        mpeg2dec->action = mpeg2_header_end_btype;
-        return STATE_SLICE;
-    } else {
-	mpeg2dec->info.display_picture = picture;
-	if (picture->nb_fields == 1)
-	    mpeg2dec->info.display_picture_2nd = picture + 1;
-	if (mpeg2dec->fbuf[2]) {
-          mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
-	  mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[2];
-          mpeg2dec->fbuf[2]=0;
-          mpeg2dec->action = mpeg2_header_end_itype;
-          return STATE_SLICE;
-        } else {
-          mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
-	  mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[1];
-          mpeg2dec->fbuf[1]=0;
-          mpeg2dec->action = mpeg2_header_end_itype2;
-          return STATE_SLICE;
-        }
-
-    }
-
-      
-#if 0
+    mpeg2_reset_info (&(mpeg2dec->info));
     if (!(mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) {
 	mpeg2dec->info.display_picture = picture;
 	if (picture->nb_fields == 1)
 	    mpeg2dec->info.display_picture_2nd = picture + 1;
 	mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[b_type];
-	if (!mpeg2dec->convert_start) {
+	if (!mpeg2dec->convert)
 	    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type + 1];
-        }
-    } else if (!mpeg2dec->convert_start) {
+    } else if (!mpeg2dec->convert)
 	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type];
-    }
-    mpeg2dec->action = mpeg2_seek_sequence;
-    mpeg2dec->first = 1;
+    mpeg2dec->action = seek_sequence;
     return STATE_END;
-#endif
 }
-
-
diff --git a/src/libmpeg2new/libmpeg2/idct.c b/src/libmpeg2new/libmpeg2/idct.c
index 57aba175b..8b982bb33 100644
--- a/src/libmpeg2new/libmpeg2/idct.c
+++ b/src/libmpeg2new/libmpeg2/idct.c
@@ -27,8 +27,8 @@
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
-#include "mpeg2_internal.h"
 #include "../include/attributes.h"
+#include "mpeg2_internal.h"
 
 #define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
 #define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
@@ -75,7 +75,7 @@ static void inline idct_row (int16_t * const block)
     /* shortcut */
     if (likely (!(block[1] | ((int32_t *)block)[1] | ((int32_t *)block)[2] |
 		  ((int32_t *)block)[3]))) {
-	uint32_t tmp = (uint16_t) (block[0] << 3);
+	uint32_t tmp = (uint16_t) (block[0] >> 1);
 	tmp |= tmp << 16;
 	((int32_t *)block)[0] = tmp;
 	((int32_t *)block)[1] = tmp;
@@ -84,7 +84,7 @@ static void inline idct_row (int16_t * const block)
 	return;
     }
 
-    d0 = (block[0] << 11) + 128;
+    d0 = (block[0] << 11) + 2048;
     d1 = block[1];
     d2 = block[2] << 11;
     d3 = block[3];
@@ -106,17 +106,17 @@ static void inline idct_row (int16_t * const block)
     b3 = t1 + t3;
     t0 -= t2;
     t1 -= t3;
-    b1 = ((t0 + t1) * 181) >> 8;
-    b2 = ((t0 - t1) * 181) >> 8;
+    b1 = ((t0 + t1) >> 8) * 181;
+    b2 = ((t0 - t1) >> 8) * 181;
 
-    block[0] = (a0 + b0) >> 8;
-    block[1] = (a1 + b1) >> 8;
-    block[2] = (a2 + b2) >> 8;
-    block[3] = (a3 + b3) >> 8;
-    block[4] = (a3 - b3) >> 8;
-    block[5] = (a2 - b2) >> 8;
-    block[6] = (a1 - b1) >> 8;
-    block[7] = (a0 - b0) >> 8;
+    block[0] = (a0 + b0) >> 12;
+    block[1] = (a1 + b1) >> 12;
+    block[2] = (a2 + b2) >> 12;
+    block[3] = (a3 + b3) >> 12;
+    block[4] = (a3 - b3) >> 12;
+    block[5] = (a2 - b2) >> 12;
+    block[6] = (a1 - b1) >> 12;
+    block[7] = (a0 - b0) >> 12;
 }
 
 static void inline idct_col (int16_t * const block)
@@ -145,10 +145,10 @@ static void inline idct_col (int16_t * const block)
     BUTTERFLY (t2, t3, W3, W5, d1, d2);
     b0 = t0 + t2;
     b3 = t1 + t3;
-    t0 = (t0 - t2) >> 8;
-    t1 = (t1 - t3) >> 8;
-    b1 = (t0 + t1) * 181;
-    b2 = (t0 - t1) * 181;
+    t0 -= t2;
+    t1 -= t3;
+    b1 = ((t0 + t1) >> 8) * 181;
+    b2 = ((t0 - t1) >> 8) * 181;
 
     block[8*0] = (a0 + b0) >> 17;
     block[8*1] = (a1 + b1) >> 17;
@@ -179,8 +179,8 @@ static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,
 	dest[6] = CLIP (block[6]);
 	dest[7] = CLIP (block[7]);
 
-	block[0] = 0;	block[1] = 0;	block[2] = 0;	block[3] = 0;
-	block[4] = 0;	block[5] = 0;	block[6] = 0;	block[7] = 0;
+	((int32_t *)block)[0] = 0;	((int32_t *)block)[1] = 0;
+	((int32_t *)block)[2] = 0;	((int32_t *)block)[3] = 0;
 
 	dest += stride;
 	block += 8;
@@ -192,7 +192,7 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
 {
     int i;
 
-    if (last != 129 || (block[0] & 7) == 4) {
+    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
 	for (i = 0; i < 8; i++)
 	    idct_row (block + 8 * i);
 	for (i = 0; i < 8; i++)
@@ -207,8 +207,8 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
 	    dest[6] = CLIP (block[6] + dest[6]);
 	    dest[7] = CLIP (block[7] + dest[7]);
 
-	    block[0] = 0;	block[1] = 0;	block[2] = 0;	block[3] = 0;
-	    block[4] = 0;	block[5] = 0;	block[6] = 0;	block[7] = 0;
+	    ((int32_t *)block)[0] = 0;	((int32_t *)block)[1] = 0;
+	    ((int32_t *)block)[2] = 0;	((int32_t *)block)[3] = 0;
 
 	    dest += stride;
 	    block += 8;
@@ -216,7 +216,7 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
     } else {
 	int DC;
 
-	DC = (block[0] + 4) >> 3;
+	DC = (block[0] + 64) >> 7;
 	block[0] = block[63] = 0;
 	i = 8;
 	do {
@@ -268,13 +268,6 @@ void mpeg2_idct_init (uint32_t accel)
 	    CLIP(i) = (i < 0) ? 0 : ((i > 255) ? 255 : i);
     } else
 #endif
-#ifdef LIBMPEG2_MLIB
-    if (accel & MPEG2_ACCEL_MLIB) {
-	mpeg2_idct_copy = mpeg2_idct_copy_mlib_non_ieee;
-	mpeg2_idct_add = (getenv ("MLIB_NON_IEEE") ?
-			  mpeg2_idct_add_mlib_non_ieee : mpeg2_idct_add_mlib);
-    } else
-#endif
     {
 	extern uint8_t mpeg2_scan_norm[64];
 	extern uint8_t mpeg2_scan_alt[64];
diff --git a/src/libmpeg2new/libmpeg2/idct_alpha.c b/src/libmpeg2new/libmpeg2/idct_alpha.c
index 68c605508..8f9beaf22 100644
--- a/src/libmpeg2new/libmpeg2/idct_alpha.c
+++ b/src/libmpeg2new/libmpeg2/idct_alpha.c
@@ -29,8 +29,10 @@
 #include <stdlib.h>
 #include <inttypes.h>
 
-#include "alpha_asm.h"
+#include "mpeg2.h"
 #include "attributes.h"
+#include "mpeg2_internal.h"
+#include "alpha_asm.h"
 
 #define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
 #define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
@@ -69,7 +71,7 @@ static void inline idct_row (int16_t * const block)
 
     /* shortcut */
     if (likely (!((l & ~0xffffUL) | r))) {
-	uint64_t tmp = (uint16_t) (l << 3);
+	uint64_t tmp = (uint16_t) (l >> 1);
 	tmp |= tmp << 16;
 	tmp |= tmp << 32;
 	((int32_t *)block)[0] = tmp;
@@ -79,7 +81,7 @@ static void inline idct_row (int16_t * const block)
 	return;
     }
 
-    d0 = (sextw (l) << 11) + 128;
+    d0 = (sextw (l) << 11) + 2048;
     d1 = sextw (extwl (l, 2));
     d2 = sextw (extwl (l, 4)) << 11;
     d3 = sextw (extwl (l, 6));
@@ -101,17 +103,17 @@ static void inline idct_row (int16_t * const block)
     b3 = t1 + t3;
     t0 -= t2;
     t1 -= t3;
-    b1 = ((t0 + t1) * 181) >> 8;
-    b2 = ((t0 - t1) * 181) >> 8;
-
-    block[0] = (a0 + b0) >> 8;
-    block[1] = (a1 + b1) >> 8;
-    block[2] = (a2 + b2) >> 8;
-    block[3] = (a3 + b3) >> 8;
-    block[4] = (a3 - b3) >> 8;
-    block[5] = (a2 - b2) >> 8;
-    block[6] = (a1 - b1) >> 8;
-    block[7] = (a0 - b0) >> 8;
+    b1 = ((t0 + t1) >> 8) * 181;
+    b2 = ((t0 - t1) >> 8) * 181;
+
+    block[0] = (a0 + b0) >> 12;
+    block[1] = (a1 + b1) >> 12;
+    block[2] = (a2 + b2) >> 12;
+    block[3] = (a3 + b3) >> 12;
+    block[4] = (a3 - b3) >> 12;
+    block[5] = (a2 - b2) >> 12;
+    block[6] = (a1 - b1) >> 12;
+    block[7] = (a0 - b0) >> 12;
 }
 
 static void inline idct_col (int16_t * const block)
@@ -140,10 +142,10 @@ static void inline idct_col (int16_t * const block)
     BUTTERFLY (t2, t3, W3, W5, d1, d2);
     b0 = t0 + t2;
     b3 = t1 + t3;
-    t0 = (t0 - t2) >> 8;
-    t1 = (t1 - t3) >> 8;
-    b1 = (t0 + t1) * 181;
-    b2 = (t0 - t1) * 181;
+    t0 -= t2;
+    t1 -= t3;
+    b1 = ((t0 + t1) >> 8) * 181;
+    b2 = ((t0 - t1) >> 8) * 181;
 
     block[8*0] = (a0 + b0) >> 17;
     block[8*1] = (a1 + b1) >> 17;
@@ -195,7 +197,7 @@ void mpeg2_idct_add_mvi (const int last, int16_t * block,
     uint64_t signmask;
     int i;
 
-    if (last != 129 || (block[0] & 7) == 4) {
+    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
 	for (i = 0; i < 8; i++)
 	    idct_row (block + 8 * i);
 	for (i = 0; i < 8; i++)
@@ -243,7 +245,7 @@ void mpeg2_idct_add_mvi (const int last, int16_t * block,
 	uint64_t p0, p1, p2, p3, p4, p5, p6, p7;
 	uint64_t DCs;
 
-	DC = (block[0] + 4) >> 3;
+	DC = (block[0] + 64) >> 7;
 	block[0] = block[63] = 0;
 
 	p0 = ldq (dest + 0 * stride);
@@ -319,7 +321,7 @@ void mpeg2_idct_add_alpha (const int last, int16_t * block,
 {
     int i;
 
-    if (last != 129 || (block[0] & 7) == 4) {
+    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
 	for (i = 0; i < 8; i++)
 	    idct_row (block + 8 * i);
 	for (i = 0; i < 8; i++)
@@ -343,7 +345,7 @@ void mpeg2_idct_add_alpha (const int last, int16_t * block,
     } else {
 	int DC;
 
-	DC = (block[0] + 4) >> 3;
+	DC = (block[0] + 64) >> 7;
 	block[0] = block[63] = 0;
 	i = 8;
 	do {
diff --git a/src/libmpeg2new/libmpeg2/idct_altivec.c b/src/libmpeg2new/libmpeg2/idct_altivec.c
index d8f3ceab0..6b1b8586c 100644
--- a/src/libmpeg2new/libmpeg2/idct_altivec.c
+++ b/src/libmpeg2new/libmpeg2/idct_altivec.c
@@ -30,9 +30,9 @@
 #endif
 #include <inttypes.h>
 
-#include "../include/mpeg2.h"
+#include "mpeg2.h"
+#include "attributes.h"
 #include "mpeg2_internal.h"
-#include "../include/attributes.h"
 
 typedef vector signed char vector_s8_t;
 typedef vector unsigned char vector_u8_t;
@@ -67,46 +67,11 @@ static const vector_s16_t constants ATTR_ALIGN(16) =
 static const vector_s16_t constants_1 ATTR_ALIGN(16) =
     VEC_S16 (16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725);
 static const vector_s16_t constants_2 ATTR_ALIGN(16) =
-    VEC_S16 (22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521);
+    VEC_S16 (16069, 22289, 20995, 18895, 16069, 18895, 20995, 22289);
 static const vector_s16_t constants_3 ATTR_ALIGN(16) =
     VEC_S16 (21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692);
 static const vector_s16_t constants_4 ATTR_ALIGN(16) =
-    VEC_S16 (19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722);
-
-#define IDCT_HALF					\
-    /* 1st stage */					\
-    t1 = vec_mradds (a1, vx7, vx1 );			\
-    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));	\
-    t7 = vec_mradds (a2, vx5, vx3);			\
-    t3 = vec_mradds (ma2, vx3, vx5);			\
-							\
-    /* 2nd stage */					\
-    t5 = vec_adds (vx0, vx4);				\
-    t0 = vec_subs (vx0, vx4);				\
-    t2 = vec_mradds (a0, vx6, vx2);			\
-    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));	\
-    t6 = vec_adds (t8, t3);				\
-    t3 = vec_subs (t8, t3);				\
-    t8 = vec_subs (t1, t7);				\
-    t1 = vec_adds (t1, t7);				\
-							\
-    /* 3rd stage */					\
-    t7 = vec_adds (t5, t2);				\
-    t2 = vec_subs (t5, t2);				\
-    t5 = vec_adds (t0, t4);				\
-    t0 = vec_subs (t0, t4);				\
-    t4 = vec_subs (t8, t3);				\
-    t3 = vec_adds (t8, t3);				\
-							\
-    /* 4th stage */					\
-    vy0 = vec_adds (t7, t1);				\
-    vy7 = vec_subs (t7, t1);				\
-    vy1 = vec_mradds (c4, t3, t5);			\
-    vy6 = vec_mradds (mc4, t3, t5);			\
-    vy2 = vec_mradds (c4, t4, t0);			\
-    vy5 = vec_mradds (mc4, t4, t0);			\
-    vy3 = vec_adds (t2, t6);				\
-    vy4 = vec_subs (t2, t6);
+    VEC_S16 (13623, 18895, 17799, 16019, 13623, 16019, 17799, 18895);
 
 #define IDCT								\
     vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;		\
@@ -124,18 +89,49 @@ static const vector_s16_t constants_4 ATTR_ALIGN(16) =
     bias = (vector_s16_t)vec_splat ((vector_s32_t)constants, 3);	\
 									\
     zero = vec_splat_s16 (0);						\
-    shift = vec_splat_u16 (4);						\
 									\
-    vx0 = vec_mradds (vec_sl (block[0], shift), constants_1, zero);	\
-    vx1 = vec_mradds (vec_sl (block[1], shift), constants_2, zero);	\
-    vx2 = vec_mradds (vec_sl (block[2], shift), constants_3, zero);	\
-    vx3 = vec_mradds (vec_sl (block[3], shift), constants_4, zero);	\
-    vx4 = vec_mradds (vec_sl (block[4], shift), constants_1, zero);	\
-    vx5 = vec_mradds (vec_sl (block[5], shift), constants_4, zero);	\
-    vx6 = vec_mradds (vec_sl (block[6], shift), constants_3, zero);	\
-    vx7 = vec_mradds (vec_sl (block[7], shift), constants_2, zero);	\
+    vx0 = vec_adds (block[0], block[4]);				\
+    vx4 = vec_subs (block[0], block[4]);				\
+    t5 = vec_mradds (vx0, constants_1, zero);				\
+    t0 = vec_mradds (vx4, constants_1, zero);				\
+									\
+    vx1 = vec_mradds (a1, block[7], block[1]);				\
+    vx7 = vec_mradds (a1, block[1], vec_subs (zero, block[7]));		\
+    t1 = vec_mradds (vx1, constants_2, zero);				\
+    t8 = vec_mradds (vx7, constants_2, zero);				\
+									\
+    vx2 = vec_mradds (a0, block[6], block[2]);				\
+    vx6 = vec_mradds (a0, block[2], vec_subs (zero, block[6]));		\
+    t2 = vec_mradds (vx2, constants_3, zero);				\
+    t4 = vec_mradds (vx6, constants_3, zero);				\
+									\
+    vx3 = vec_mradds (block[3], constants_4, zero);			\
+    vx5 = vec_mradds (block[5], constants_4, zero);			\
+    t7 = vec_mradds (a2, vx5, vx3);					\
+    t3 = vec_mradds (ma2, vx3, vx5);					\
 									\
-    IDCT_HALF								\
+    t6 = vec_adds (t8, t3);						\
+    t3 = vec_subs (t8, t3);						\
+    t8 = vec_subs (t1, t7);						\
+    t1 = vec_adds (t1, t7);						\
+    t6 = vec_mradds (a0, t6, t6);	/* a0+1 == 2*c4 */		\
+    t1 = vec_mradds (a0, t1, t1);	/* a0+1 == 2*c4 */		\
+									\
+    t7 = vec_adds (t5, t2);						\
+    t2 = vec_subs (t5, t2);						\
+    t5 = vec_adds (t0, t4);						\
+    t0 = vec_subs (t0, t4);						\
+    t4 = vec_subs (t8, t3);						\
+    t3 = vec_adds (t8, t3);						\
+									\
+    vy0 = vec_adds (t7, t1);						\
+    vy7 = vec_subs (t7, t1);						\
+    vy1 = vec_adds (t5, t3);						\
+    vy6 = vec_subs (t5, t3);						\
+    vy2 = vec_adds (t0, t4);						\
+    vy5 = vec_subs (t0, t4);						\
+    vy3 = vec_adds (t2, t6);						\
+    vy4 = vec_subs (t2, t6);						\
 									\
     vx0 = vec_mergeh (vy0, vy4);					\
     vx1 = vec_mergel (vy0, vy4);					\
@@ -155,7 +151,7 @@ static const vector_s16_t constants_4 ATTR_ALIGN(16) =
     vy6 = vec_mergeh (vx3, vx7);					\
     vy7 = vec_mergel (vx3, vx7);					\
 									\
-    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);			\
+    vx0 = vec_mergeh (vy0, vy4);					\
     vx1 = vec_mergel (vy0, vy4);					\
     vx2 = vec_mergeh (vy1, vy5);					\
     vx3 = vec_mergel (vy1, vy5);					\
@@ -164,7 +160,39 @@ static const vector_s16_t constants_4 ATTR_ALIGN(16) =
     vx6 = vec_mergeh (vy3, vy7);					\
     vx7 = vec_mergel (vy3, vy7);					\
 									\
-    IDCT_HALF								\
+    vx0 = vec_adds (vx0, bias);						\
+    t5 = vec_adds (vx0, vx4);						\
+    t0 = vec_subs (vx0, vx4);						\
+									\
+    t1 = vec_mradds (a1, vx7, vx1);					\
+    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));			\
+									\
+    t2 = vec_mradds (a0, vx6, vx2);					\
+    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));			\
+									\
+    t7 = vec_mradds (a2, vx5, vx3);					\
+    t3 = vec_mradds (ma2, vx3, vx5);					\
+									\
+    t6 = vec_adds (t8, t3);						\
+    t3 = vec_subs (t8, t3);						\
+    t8 = vec_subs (t1, t7);						\
+    t1 = vec_adds (t1, t7);						\
+									\
+    t7 = vec_adds (t5, t2);						\
+    t2 = vec_subs (t5, t2);						\
+    t5 = vec_adds (t0, t4);						\
+    t0 = vec_subs (t0, t4);						\
+    t4 = vec_subs (t8, t3);						\
+    t3 = vec_adds (t8, t3);						\
+									\
+    vy0 = vec_adds (t7, t1);						\
+    vy7 = vec_subs (t7, t1);						\
+    vy1 = vec_mradds (c4, t3, t5);					\
+    vy6 = vec_mradds (mc4, t3, t5);					\
+    vy2 = vec_mradds (c4, t4, t0);					\
+    vy5 = vec_mradds (mc4, t4, t0);					\
+    vy3 = vec_adds (t2, t6);						\
+    vy4 = vec_subs (t2, t6);						\
 									\
     shift = vec_splat_u16 (6);						\
     vx0 = vec_sra (vy0, shift);						\
diff --git a/src/libmpeg2new/libmpeg2/idct_mmx.c b/src/libmpeg2new/libmpeg2/idct_mmx.c
index c0e88f220..d5a5c08a4 100644
--- a/src/libmpeg2new/libmpeg2/idct_mmx.c
+++ b/src/libmpeg2new/libmpeg2/idct_mmx.c
@@ -28,11 +28,11 @@
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
-#include "mpeg2_internal.h"
 #include "../include/attributes.h"
+#include "mpeg2_internal.h"
 #include "../include/mmx.h"
 
-#define ROW_SHIFT 11
+#define ROW_SHIFT 15
 #define COL_SHIFT 6
 
 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
@@ -701,7 +701,7 @@ do {					\
 static inline void block_add_DC (int16_t * const block, uint8_t * dest,
 				 const int stride, const int cpu)
 {
-    movd_v2r ((block[0] + 4) >> 3, mm0);
+    movd_v2r ((block[0] + 64) >> 7, mm0);
     pxor_r2r (mm1, mm1);
     movq_m2r (*dest, mm2);
     dup4 (mm0);
@@ -763,7 +763,7 @@ void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
 void mpeg2_idct_add_mmxext (const int last, int16_t * const block,
 			    uint8_t * const dest, const int stride)
 {
-    if (last != 129 || (block[0] & 7) == 4) {
+    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
 	mmxext_idct (block);
 	block_add (block, dest, stride);
 	block_zero (block);
@@ -786,7 +786,7 @@ void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
 void mpeg2_idct_add_mmx (const int last, int16_t * const block,
 			 uint8_t * const dest, const int stride)
 {
-    if (last != 129 || (block[0] & 7) == 4) {
+    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
 	mmx_idct (block);
 	block_add (block, dest, stride);
 	block_zero (block);
diff --git a/src/libmpeg2new/libmpeg2/libmpeg2convert.pc.in b/src/libmpeg2new/libmpeg2/libmpeg2convert.pc.in
new file mode 100644
index 000000000..42383a6e2
--- /dev/null
+++ b/src/libmpeg2new/libmpeg2/libmpeg2convert.pc.in
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libmpeg2convert
+Description: libmpeg2 helper functions for converting to various formats.
+Version: @VERSION@
+Libs: -L${libdir} -lmpeg2convert
+Cflags: -I${includedir}/@PACKAGE@
diff --git a/src/libmpeg2new/libmpeg2/motion_comp.c b/src/libmpeg2new/libmpeg2/motion_comp.c
index cf9f807e2..d5a265d5c 100644
--- a/src/libmpeg2new/libmpeg2/motion_comp.c
+++ b/src/libmpeg2new/libmpeg2/motion_comp.c
@@ -26,6 +26,7 @@
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
+#include "../include/attributes.h"
 #include "mpeg2_internal.h"
 
 mpeg2_mc_t mpeg2_mc;
@@ -51,9 +52,9 @@ void mpeg2_mc_init (uint32_t accel)
 	mpeg2_mc = mpeg2_mc_alpha;
     else
 #endif
-#ifdef LIBMPEG2_MLIB
-    if (accel & MPEG2_ACCEL_MLIB)
-	mpeg2_mc = mpeg2_mc_mlib;
+#ifdef ARCH_SPARC
+    if (accel & MPEG2_ACCEL_SPARC_VIS)
+	mpeg2_mc = mpeg2_mc_vis;
     else
 #endif
 	mpeg2_mc = mpeg2_mc_c;
diff --git a/src/libmpeg2new/libmpeg2/motion_comp_alpha.c b/src/libmpeg2new/libmpeg2/motion_comp_alpha.c
index efa0c44af..05cd55084 100644
--- a/src/libmpeg2new/libmpeg2/motion_comp_alpha.c
+++ b/src/libmpeg2new/libmpeg2/motion_comp_alpha.c
@@ -26,9 +26,10 @@
 
 #include <inttypes.h>
 
-#include "../include/mpeg2.h"
+#include "mpeg2.h"
+#include "attributes.h"
 #include "mpeg2_internal.h"
-#include "../include/alpha_asm.h"
+#include "alpha_asm.h"
 
 static inline uint64_t avg2 (uint64_t a, uint64_t b)
 {
diff --git a/src/libmpeg2new/libmpeg2/motion_comp_altivec.c b/src/libmpeg2new/libmpeg2/motion_comp_altivec.c
index f0b6fa691..4356aa6e7 100644
--- a/src/libmpeg2new/libmpeg2/motion_comp_altivec.c
+++ b/src/libmpeg2new/libmpeg2/motion_comp_altivec.c
@@ -30,7 +30,8 @@
 #endif
 #include <inttypes.h>
 
-#include "../include/mpeg2.h"
+#include "mpeg2.h"
+#include "attributes.h"
 #include "mpeg2_internal.h"
 
 typedef vector signed char vector_s8_t;
diff --git a/src/libmpeg2new/libmpeg2/motion_comp_mmx.c b/src/libmpeg2new/libmpeg2/motion_comp_mmx.c
index fc8e83abc..8694bdfea 100644
--- a/src/libmpeg2new/libmpeg2/motion_comp_mmx.c
+++ b/src/libmpeg2new/libmpeg2/motion_comp_mmx.c
@@ -28,8 +28,8 @@
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
-#include "mpeg2_internal.h"
 #include "../include/attributes.h"
+#include "mpeg2_internal.h"
 #include "../include/mmx.h"
 
 #define CPU_MMXEXT 0
diff --git a/src/libmpeg2new/libmpeg2/motion_comp_vis.c b/src/libmpeg2new/libmpeg2/motion_comp_vis.c
new file mode 100644
index 000000000..54c0f7e75
--- /dev/null
+++ b/src/libmpeg2new/libmpeg2/motion_comp_vis.c
@@ -0,0 +1,2061 @@
+/*
+ * motion_comp_vis.c
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_SPARC
+
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+#include "vis.h"
+
+/* The trick used in some of this file is the formula from the MMX
+ * motion comp code, which is:
+ *
+ * (x+y+1)>>1 == (x|y)-((x^y)>>1)
+ *
+ * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
+ * We avoid overflows by masking before we do the shift, and we
+ * implement the shift by multiplying by 1/2 using mul8x16.  So in
+ * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
+ * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
+ * the value 0x80808080 is in f8):
+ *
+ *	fxor		f0, f2, f10
+ *	fand		f10, f4, f10
+ *	fmul8x16	f8, f10, f10
+ *	fand		f10, f6, f10
+ *	for		f0, f2, f12
+ *	fpsub16		f12, f10, f10
+ */
+
+/* 8-byte-aligned constant tables, fetched whole with vis_ld64.  The byte
+ * tables are uint8_t because 0xfe and 128 are not representable in int8_t. */
+#define DUP4(x) {x, x, x, x}
+#define DUP8(x) {x, x, x, x, x, x, x, x}
+static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
+static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
+static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
+static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
+static const uint8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
+static const uint8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
+static const uint8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
+static const int16_t constants256_512[] ATTR_ALIGN(8) = {256, 512, 256, 512};
+static const int16_t constants256_1024[] ATTR_ALIGN(8) = {256, 1024, 256, 1024};
+
+#define REF_0		0	/* presumably FP register numbers (see vis.h) */
+#define REF_0_1		1
+#define REF_2		2
+#define REF_2_1		3
+#define REF_4		4
+#define REF_4_1		5
+#define REF_6		6
+#define REF_6_1		7
+#define REF_S0		8
+#define REF_S0_1	9
+#define REF_S2		10
+#define REF_S2_1	11
+#define REF_S4		12
+#define REF_S4_1	13
+#define REF_S6		14
+#define REF_S6_1	15
+#define DST_0		16
+#define DST_1		17
+#define DST_2		18
+#define DST_3		19
+#define CONST_1		20	/* 20 is shared by CONST_1/2/3/6 and MASK_fe */
+#define CONST_2		20
+#define CONST_3		20
+#define CONST_6		20
+#define MASK_fe		20	/* so presumably only one of them is live at a time */
+#define CONST_128	22	/* 22 is shared by CONST_128/256/512/1024 */
+#define CONST_256	22
+#define CONST_512	22
+#define CONST_1024	22
+#define TMP0		24
+#define TMP1		25
+#define TMP2		26
+#define TMP3		27
+#define TMP4		28
+#define TMP5		29
+#define ZERO		30	/* 30 is shared by ZERO and MASK_7f */
+#define MASK_7f		30
+/* TMP6 and up use even numbers only, starting at 32. */
+#define TMP6		32
+#define TMP8		34
+#define TMP10		36
+#define TMP12		38
+#define TMP14		40
+#define TMP16		42
+#define TMP18		44
+#define TMP20		46
+#define TMP22		48
+#define TMP24		50
+#define TMP26		52
+#define TMP28		54
+#define TMP30		56
+#define TMP32		58
+#define TMP32		58
+
+static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+	/* Copies a 16-byte-wide block, `height` rows (>= 1), from _ref to dest. */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;
+	do {	/* 5 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+		/* Third load: ref+16 if vis_alignaddr moved ref, else ref+0 again. */
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+		/* NOTE(review): presumably extracts the unaligned 8 bytes of each pair. */
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_st64(REF_0, dest[0]);
+
+		vis_faligndata(TMP2, TMP4, REF_2);
+		vis_st64_2(REF_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+	/* Copies an 8-byte-wide block, `height` rows (>= 1), from _ref to dest. */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;
+	do {	/* 4 cycles */
+		vis_ld64(ref[0], TMP0);
+		/* Second load: ref+8 if vis_alignaddr moved ref, else ref+0 again. */
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+
+		/* stall */
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_st64(REF_0, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+
+static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+	int offset;
+	/* Averages ref into dest, 16 bytes wide, via (x|y)-((x^y)>>1) per byte. */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;
+	/* Prime row 0: ref data ends up in REF_0/REF_2, dest data in DST_0/DST_2. */
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(dest[8], DST_2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_ld64(constants128[0], CONST_128);
+	/* Two rows per pass, last two after the loop: needs even height >= 4. */
+	ref += stride;
+	height = (height >> 1) - 1;
+
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP6);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_xor(DST_2, REF_2, TMP8);
+
+		vis_and(TMP8, MASK_fe, TMP8);
+		/* (x|y) term; DST_0/DST_2 are then reloaded from the next dest row. */
+		vis_or(DST_0, REF_0, TMP10);
+		vis_ld64_2(dest, stride, DST_0);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+
+		vis_or(DST_2, REF_2, TMP12);
+		vis_ld64_2(dest, stride_8, DST_2);
+
+		vis_ld64(ref[0], TMP14);
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+
+		dest += stride;
+		vis_ld64_2(ref, 8, TMP16);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, offset, TMP18);
+		vis_faligndata(TMP2, TMP4, REF_2);
+		ref += stride;
+		/* Second row of the pass; its ref data came from TMP0/TMP2/TMP4. */
+		vis_xor(DST_0, REF_0, TMP20);
+
+		vis_and(TMP20, MASK_fe, TMP20);
+
+		vis_xor(DST_2, REF_2, TMP22);
+		vis_mul8x16(CONST_128, TMP20, TMP20);
+
+		vis_and(TMP22, MASK_fe, TMP22);
+
+		vis_or(DST_0, REF_0, TMP24);
+		vis_mul8x16(CONST_128, TMP22, TMP22);
+
+		vis_or(DST_2, REF_2, TMP26);
+
+		vis_ld64_2(dest, stride, DST_0);
+		vis_faligndata(TMP14, TMP16, REF_0);
+
+		vis_ld64_2(dest, stride_8, DST_2);
+		vis_faligndata(TMP16, TMP18, REF_2);
+
+		vis_and(TMP20, MASK_7f, TMP20);
+
+		vis_and(TMP22, MASK_7f, TMP22);
+
+		vis_psub16(TMP24, TMP20, TMP20);
+		vis_st64(TMP20, dest[0]);
+
+		vis_psub16(TMP26, TMP22, TMP22);
+		vis_st64_2(TMP22, dest, 8);
+		dest += stride;
+	} while (--height);
+	/* Final two rows: same arithmetic as the loop, ref is not advanced again. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(DST_0, REF_0, TMP6);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_ld64_2(ref, offset, TMP4);
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_xor(DST_2, REF_2, TMP8);
+
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_or(DST_0, REF_0, TMP10);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+
+	vis_or(DST_2, REF_2, TMP12);
+	vis_ld64_2(dest, stride_8, DST_2);
+
+	vis_ld64(ref[0], TMP14);	/* NOTE(review): TMP14 is not read after this */
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+
+	dest += stride;
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_xor(DST_0, REF_0, TMP20);
+
+	vis_and(TMP20, MASK_fe, TMP20);
+
+	vis_xor(DST_2, REF_2, TMP22);
+	vis_mul8x16(CONST_128, TMP20, TMP20);
+
+	vis_and(TMP22, MASK_fe, TMP22);
+
+	vis_or(DST_0, REF_0, TMP24);
+	vis_mul8x16(CONST_128, TMP22, TMP22);
+
+	vis_or(DST_2, REF_2, TMP26);
+
+	vis_and(TMP20, MASK_7f, TMP20);
+
+	vis_and(TMP22, MASK_7f, TMP22);
+
+	vis_psub16(TMP24, TMP20, TMP20);
+	vis_st64(TMP20, dest[0]);
+
+	vis_psub16(TMP26, TMP22, TMP22);
+	vis_st64_2(TMP22, dest, 8);
+}
+
+static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+	/* Averages ref into dest, 8 bytes wide, via (x|y)-((x^y)>>1) per byte. */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;
+	/* Prime row 0: ref data ends up in REF_0, dest data in DST_0. */
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, offset, TMP2);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+	/* Two rows per pass, last two after the loop: needs even height >= 4. */
+	ref += stride;
+	height = (height >> 1) - 1;
+
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP4);
+
+		vis_ld64_2(ref, offset, TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+
+		vis_or(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+		/* Fetch the following row into TMP12; REF_0 takes the pass's 2nd row. */
+		vis_ld64(ref[0], TMP12);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, offset, TMP2);
+		vis_xor(DST_0, REF_0, TMP0);
+		ref += stride;
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_psub16(TMP6, TMP4, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+
+		vis_or(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+
+		vis_faligndata(TMP12, TMP2, REF_0);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_psub16(TMP6, TMP0, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+	} while (--height);
+	/* Final two rows: same arithmetic as the loop, ref is not advanced again. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(DST_0, REF_0, TMP4);
+
+	vis_ld64_2(ref, offset, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(DST_0, REF_0, TMP6);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(DST_0, REF_0, TMP0);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_psub16(TMP6, TMP4, TMP4);
+	vis_st64(TMP4, dest[0]);
+	dest += stride;
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_or(DST_0, REF_0, TMP6);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_psub16(TMP6, TMP0, TMP4);
+	vis_st64(TMP4, dest[0]);
+}
+/* put, horizontal pair, 16 bytes wide: each dest row is built from a ref row and the same row one byte right. */
+static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	/* off is _ref's byte offset inside its 8-byte word; off_plus_1 selects the window one byte further. */
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0],    TMP0);
+
+	vis_ld64_2(ref, 8,  TMP2);
+
+	vis_ld64_2(ref, 16, TMP4);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP2, TMP4, REF_4);
+	/* off == 7 means off + 1 == 8: the shifted window is exactly the next doubleword, so it is copied. */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* loop handles 2 rows per pass; the last 2 rows follow the loop */
+	/* NOTE(review): the do/while body always runs once, so height must be even and >= 4. */
+	do {	/* 34 cycles */
+		vis_ld64(ref[0],    TMP0);
+		vis_xor(REF_0, REF_2, TMP6);
+		/* Result per 8 bytes: (A | B) - ((((A ^ B) & MASK_fe) * CONST_128) & MASK_7f), A/B = REF_0/REF_2 or REF_4/REF_6. */
+		vis_ld64_2(ref, 8,  TMP2);
+		vis_xor(REF_4, REF_6, TMP8);
+		/* NOTE(review): presumably the rounded-up byte average; constants are defined elsewhere -- confirm. */
+		vis_ld64_2(ref, 16, TMP4);
+		vis_and(TMP6, MASK_fe, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],    TMP14);
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_ld64_2(ref, 8,  TMP16);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_or(REF_0, REF_2, TMP10);
+
+		vis_ld64_2(ref, 16, TMP18);
+		ref += stride;
+		vis_or(REF_4, REF_6, TMP12);
+		/* Back to alignment off before rebuilding REF_0/REF_4; the branch below may switch to off_plus_1. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+		/* Second row of this pass: same arithmetic on the REF registers just rebuilt. */
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_or(REF_0, REF_2, TMP10);
+
+		vis_or(REF_4, REF_6, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP14, TMP16, REF_0);
+
+		vis_faligndata(TMP16, TMP18, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP14, TMP16, REF_2);
+			vis_faligndata(TMP16, TMP18, REF_6);
+		} else {
+			vis_src1(TMP16, REF_2);
+			vis_src1(TMP18, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+	} while (--height);
+	/* Epilogue: last two rows; only the one remaining ref row is fetched. */
+	vis_ld64(ref[0],    TMP0);
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_ld64_2(ref, 8,  TMP2);
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_or(REF_0, REF_2, TMP10);
+
+	vis_or(REF_4, REF_6, TMP12);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_or(REF_0, REF_2, TMP10);
+
+	vis_or(REF_4, REF_6, TMP12);
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+}
+/* put, horizontal pair, 8 bytes wide: each dest row is built from a ref row and the same row one byte right. */
+static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	/* off is _ref's byte offset inside its 8-byte word; off_plus_1 selects the window one byte further. */
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+	/* The second doubleword is always fetched here, whatever the value of off. */
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP0, TMP2, REF_0);
+	/* off == 7 means off + 1 == 8: the shifted window is exactly the next doubleword, so it is copied. */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {
+		vis_src1(TMP2, REF_2);
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* loop handles 2 rows per pass; the last 2 rows follow the loop */
+	/* NOTE(review): the do/while body always runs once, so height must be even and >= 4. */
+	do {	/* 20 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+		/* Result per 8 bytes: (REF_0 | REF_2) - ((((REF_0 ^ REF_2) & MASK_fe) * CONST_128) & MASK_7f). */
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+		ref += stride;
+		/* NOTE(review): presumably the rounded-up byte average; constants are defined elsewhere -- confirm. */
+		vis_ld64(ref[0], TMP8);
+		vis_or(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+		/* Back to alignment off before rebuilding REF_0; the branch below may switch to off_plus_1. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+		} else {
+			vis_src1(TMP2, REF_2);
+		}
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_psub16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		/* Second row of this pass: same arithmetic on the REF registers just rebuilt. */
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_or(REF_0, REF_2, TMP14);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+		vis_faligndata(TMP8, TMP10, REF_0);
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP8, TMP10, REF_2);
+		} else {
+			vis_src1(TMP10, REF_2);
+		}
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_psub16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+	/* Epilogue: last two rows; only the one remaining ref row is fetched. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {
+		vis_src1(TMP2, REF_2);
+	}
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_psub16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_or(REF_0, REF_2, TMP14);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_psub16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;	/* no effect: dest is not used after this point */
+}
+/* avg, horizontal pair, 16 bytes wide: folds a ref row, the same row one byte right, and dest back into dest. */
+static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	/* NOTE(review): scale factor 5 is presumably consumed by the vis_pack16 steps -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);
+	/* One row per pass, so any height >= 1 works here. */
+	ref = vis_alignaddr(ref);
+	do {	/* 26 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64(ref[8], TMP2);
+		/* Back to alignment off on every pass; the branch below may switch to off_plus_1. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64(ref[16], TMP4);
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64(dest[8], DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+		/* off == 7 means off + 1 == 8: the shifted window is exactly the next doubleword, so it is copied. */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+		/* Low 8 bytes: widened REF_0 + widened REF_2 + DST_0 * CONST_512 + CONST_3, then packed. */
+		vis_mul8x16au(REF_0,   CONST_256, TMP0);
+		/* NOTE(review): presumably (a + b + 2*dst + 3) >> 2; constant values are defined elsewhere -- confirm. */
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO, REF_2_1, TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_mul8x16al(DST_0,   CONST_512, TMP4);
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_mul8x16al(DST_1,   CONST_512, TMP6);
+		/* High 8 bytes: the same sum built from REF_4, REF_6 and DST_2/DST_3. */
+		vis_mul8x16au(REF_6,   CONST_256, TMP12);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4,   CONST_256, TMP16);
+
+		vis_padd16(TMP0, CONST_3, TMP8);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP18);
+
+		vis_padd16(TMP2, CONST_3, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_padd16(TMP16, TMP12, TMP0);
+
+		vis_st64(DST_0, dest[0]);
+		vis_mul8x16al(DST_2,   CONST_512, TMP4);
+		vis_padd16(TMP18, TMP14, TMP2);
+
+		vis_mul8x16al(DST_3,   CONST_512, TMP6);
+		vis_padd16(TMP0, CONST_3, TMP0);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[8]);
+
+		ref += stride;
+		dest += stride;
+	} while (--height);
+}
+/* avg, horizontal pair, 8 bytes wide: folds a ref row, the same row one byte right, and dest back into dest. */
+static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_times_2 = stride << 1;
+	/* NOTE(review): scale factor 5 is presumably consumed by the vis_pack16 steps -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);
+	/* Four rows per pass; NOTE(review): height must be a multiple of 4 and >= 4 or the loop overruns. */
+	ref = vis_alignaddr(ref);
+	height >>= 2;
+	do {	/* 47 cycles */
+		vis_ld64(ref[0],   TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+		/* Back to alignment off on every pass; the branch below may switch to off_plus_1. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64(ref[0],   TMP4);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, 8, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],   TMP8);
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP4, TMP6, REF_4);
+
+		vis_ld64(ref[0],   TMP12);
+
+		vis_ld64_2(ref, 8, TMP14);
+		ref += stride;
+		vis_faligndata(TMP8, TMP10, REF_S0);
+		/* REF_0, REF_4, REF_S0, REF_S4 hold rows 0..3 at off; REF_2, REF_6, REF_S2, REF_S6 the shifted copies. */
+		vis_faligndata(TMP12, TMP14, REF_S4);
+		/* off == 7 means off + 1 == 8: the shifted window is exactly the next doubleword, so it is copied. */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+
+			vis_ld64(dest[0], DST_0);
+			vis_faligndata(TMP0, TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_faligndata(TMP4, TMP6, REF_6);
+
+			vis_faligndata(TMP8, TMP10, REF_S2);
+
+			vis_faligndata(TMP12, TMP14, REF_S6);
+		} else {
+			vis_ld64(dest[0], DST_0);
+			vis_src1(TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_src1(TMP6, REF_6);
+
+			vis_src1(TMP10, REF_S2);
+
+			vis_src1(TMP14, REF_S6);
+		}
+		/* Each row: widened ref + widened shifted ref + dest * CONST_512 + CONST_3, then packed. */
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+		/* NOTE(review): presumably (a + b + 2*dst + 3) >> 2; constant values are defined elsewhere -- confirm. */
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP16, TMP0);
+		vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_2, CONST_512, TMP16);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(DST_3, CONST_512, TMP18);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP10, CONST_3, TMP10);
+		/* dest now points at row 1: fetch dest rows 2 and 3 ahead of their use. */
+		vis_ld64_2(dest, stride, DST_0);
+		vis_padd16(TMP8, TMP16, TMP8);
+		/* Row 3 goes into TMP4/TMP5 because DST_2/DST_3 are about to receive the packed row 1 result. */
+		vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
+		vis_padd16(TMP10, TMP18, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+		/* Rows 2 and 3 of this pass, from the REF_S* registers. */
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_pmerge(ZERO,     REF_S0,     TMP0);
+
+		vis_pmerge(ZERO,     REF_S2,     TMP24);
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16au(REF_S4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP24, TMP0);
+		vis_mul8x16au(REF_S6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP10, CONST_3, TMP10);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
+
+		vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
+		vis_padd16(TMP0, TMP16, TMP0);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_padd16(TMP8, TMP20, TMP8);
+
+		vis_padd16(TMP10, TMP22, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+/* put, vertical pair, 16 bytes wide: each dest row is built from a ref row and the ref row below it. */
+static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+	/* offset becomes 16 only if vis_alignaddr() moved ref; otherwise the third load re-reads ref[0]. */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;
+	/* Prologue: REF_0/REF_4 take ref row 0, REF_2/REF_6 take ref row 1. */
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP6);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64_2(ref, 8, TMP8);
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_ld64_2(ref, offset, TMP10);
+	ref += stride;
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP6, TMP8, REF_2);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP8, TMP10, REF_6);
+	/* NOTE(review): the do/while body always runs once, so height must be even and >= 4. */
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;	/* loop handles 2 rows per pass; the last 2 rows follow the loop */
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP12);
+		/* Result per 8 bytes: (A | B) - ((((A ^ B) & MASK_fe) * CONST_128) & MASK_7f), A/B = REF_0/REF_2 or REF_4/REF_6. */
+		vis_ld64_2(ref, 8, TMP2);
+		vis_xor(REF_4, REF_6, TMP16);
+		/* NOTE(review): presumably the rounded-up byte average; constants are defined elsewhere -- confirm. */
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+		vis_or(REF_0, REF_2, TMP14);
+
+		vis_ld64(ref[0], TMP6);
+		vis_or(REF_4, REF_6, TMP18);
+
+		vis_ld64_2(ref, 8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, offset, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+		/* REF_0/REF_4 now hold the next even row; REF_2/REF_6 still hold the odd row between them. */
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_and(TMP16, MASK_fe, TMP16);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_mul8x16(CONST_128, TMP16, TMP16);
+		vis_xor(REF_0, REF_2, TMP0);
+
+		vis_xor(REF_4, REF_6, TMP2);
+
+		vis_or(REF_0, REF_2, TMP20);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_and(TMP16, MASK_7f, TMP16);
+
+		vis_psub16(TMP14, TMP12, TMP12);
+		vis_st64(TMP12, dest[0]);
+
+		vis_psub16(TMP18, TMP16, TMP16);
+		vis_st64_2(TMP16, dest, 8);
+		dest += stride;
+
+		vis_or(REF_4, REF_6, TMP18);
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP2, MASK_fe, TMP2);
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+		/* REF_2/REF_6 are replaced only after every xor/or that needed the old odd row has been issued. */
+		vis_faligndata(TMP6, TMP8, REF_2);
+		vis_mul8x16(CONST_128, TMP2, TMP2);
+
+		vis_faligndata(TMP8, TMP10, REF_6);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_and(TMP2, MASK_7f, TMP2);
+
+		vis_psub16(TMP20, TMP0, TMP0);
+		vis_st64(TMP0, dest[0]);
+
+		vis_psub16(TMP18, TMP2, TMP2);
+		vis_st64_2(TMP2, dest, 8);
+		dest += stride;
+	} while (--height);
+	/* Epilogue: last two rows; only the one remaining ref row is fetched. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_xor(REF_4, REF_6, TMP16);
+
+	vis_ld64_2(ref, offset, TMP4);
+	vis_or(REF_0, REF_2, TMP14);
+
+	vis_or(REF_4, REF_6, TMP18);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_and(TMP16, MASK_fe, TMP16);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_mul8x16(CONST_128, TMP16, TMP16);
+	vis_xor(REF_0, REF_2, TMP0);
+
+	vis_xor(REF_4, REF_6, TMP2);
+
+	vis_or(REF_0, REF_2, TMP20);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_and(TMP16, MASK_7f, TMP16);
+
+	vis_psub16(TMP14, TMP12, TMP12);
+	vis_st64(TMP12, dest[0]);
+
+	vis_psub16(TMP18, TMP16, TMP16);
+	vis_st64_2(TMP16, dest, 8);
+	dest += stride;
+
+	vis_or(REF_4, REF_6, TMP18);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP2, MASK_fe, TMP2);
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_mul8x16(CONST_128, TMP2, TMP2);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_and(TMP2, MASK_7f, TMP2);
+
+	vis_psub16(TMP20, TMP0, TMP0);
+	vis_st64(TMP0, dest[0]);
+
+	vis_psub16(TMP18, TMP2, TMP2);
+	vis_st64_2(TMP2, dest, 8);
+}
+/* put, vertical pair, 8 bytes wide: each dest row is built from a ref row and the ref row below it. */
+static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+	/* offset becomes 8 only if vis_alignaddr() moved ref; otherwise the second load re-reads ref[0]. */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;
+	/* Prologue: REF_0 takes ref row 0, REF_2 takes ref row 1. */
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, offset, TMP2);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP4);
+
+	vis_ld64_2(ref, offset, TMP6);
+	ref += stride;
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP4, TMP6, REF_2);
+	/* NOTE(review): the do/while body always runs once, so height must be even and >= 4. */
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;	/* loop handles 2 rows per pass; the last 2 rows follow the loop */
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+		/* Result per 8 bytes: (REF_0 | REF_2) - ((((REF_0 ^ REF_2) & MASK_fe) * CONST_128) & MASK_7f). */
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+		vis_and(TMP4, MASK_fe, TMP4);
+		/* NOTE(review): presumably the rounded-up byte average; constants are defined elsewhere -- confirm. */
+		vis_or(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+		/* TMP0 is consumed by the align and then immediately reused for the following row's load. */
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+		vis_or(REF_0, REF_2, TMP14);
+
+		vis_psub16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		/* REF_2 is replaced only after the xor/or that needed the old odd row have been issued. */
+		vis_faligndata(TMP0, TMP2, REF_2);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_psub16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+	/* Epilogue: last two rows; only the one remaining ref row is fetched. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, offset, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+	vis_or(REF_0, REF_2, TMP14);
+
+	vis_psub16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_psub16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);
+}
+/* avg, vertical pair, 16 bytes wide: folds a ref row, the ref row below it, and dest back into dest. */
+static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+	int stride_16;
+	int offset;
+	/* NOTE(review): scale factor 5 is presumably consumed by the vis_pack16 steps -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;
+	/* offset is 16 only if vis_alignaddr() moved ref; stride_16 below is stride + offset, not always stride + 16. */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+	stride_16 = stride + offset;
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_6);
+	height >>= 1;	/* 2 rows per pass, no epilogue; NOTE(review): height must be even and >= 2 */
+	/* On loop entry REF_2/REF_6 hold the current ref row; each pass reads the two rows below via ref + stride. */
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP12);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP14);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_pmerge(ZERO,       REF_6,     TMP16);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_pmerge(ZERO,     REF_4,     TMP4);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		/* REF_S0/REF_S2 are borrowed to hold the second dest row (the DST_4/DST_6 role in the inline notes). */
+		vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
+		vis_faligndata(TMP6, TMP8, REF_2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
+		vis_faligndata(TMP8, TMP10, REF_6);
+		vis_mul8x16al(DST_0,   CONST_512, TMP20);
+		/* Each row: widened upper ref + widened lower ref + dest * CONST_512 + CONST_3, then packed. */
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_1,   CONST_512, TMP22);
+		/* NOTE(review): presumably (a + b + 2*dst + 3) >> 2; constant values are defined elsewhere -- confirm. */
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP4, CONST_3, TMP4);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_padd16(TMP6, CONST_3, TMP6);
+
+		vis_padd16(TMP12, TMP20, TMP12);
+		vis_mul8x16al(REF_S0,   CONST_512, TMP20);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_mul8x16al(REF_S2,   CONST_512, TMP24);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_2,   CONST_256, TMP28);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP30);
+		/* REF_S4/REF_S6 are used here as plain scratch registers for the widened REF_6 halves. */
+		vis_padd16(TMP16, TMP4, TMP16);
+		vis_mul8x16au(REF_6,   CONST_256, REF_S4);
+
+		vis_padd16(TMP18, TMP6, TMP18);
+		vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
+
+		vis_pack16(TMP12, DST_0);
+		vis_padd16(TMP28, TMP0, TMP12);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP30, TMP2, TMP14);
+
+		vis_pack16(TMP16, DST_2);
+		vis_padd16(REF_S4, TMP4, TMP16);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		vis_padd16(REF_S6, TMP6, TMP18);
+
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_pack16(TMP16, DST_2);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+/* avg, vertical pair, 8 bytes wide: folds a ref row, the ref row below it, and dest back into dest. */
+static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8;
+	int offset;
+	/* NOTE(review): scale factor 5 is presumably consumed by the vis_pack16 steps -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;
+	/* offset is 8 only if vis_alignaddr() moved ref; stride_8 below is stride + offset, not always stride + 8. */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64_2(ref, offset, TMP2);
+	stride_8 = stride + offset;
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	/* 2 rows per pass, no epilogue; NOTE(review): height must be even and >= 2. */
+	height >>= 1;
+	do {	/* 20 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP8);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP10);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+
+		vis_ld64_2(dest, stride, DST_2);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+		vis_pmerge(ZERO,       REF_0,     TMP12);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+		vis_pmerge(ZERO,       REF_0_1,   TMP14);
+		/* TMP12/TMP14 (middle ref row + CONST_3) are shared by both output rows of this pass. */
+		vis_padd16(TMP12, CONST_3, TMP12);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP14, CONST_3, TMP14);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_faligndata(TMP4, TMP6, REF_2);
+		/* Each row: widened upper ref + widened lower ref + dest * CONST_512 + CONST_3, then packed. */
+		vis_padd16(TMP8, TMP12, TMP8);
+		/* NOTE(review): presumably (a + b + 2*dst + 3) >> 2; constant values are defined elsewhere -- confirm. */
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_mul8x16au(REF_2,   CONST_256, TMP20);
+
+		vis_padd16(TMP8, TMP16, TMP0);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP22);
+
+		vis_padd16(TMP10, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+
+		vis_padd16(TMP12, TMP24, TMP0);
+
+		vis_padd16(TMP14, TMP26, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+/* put, 2x2 neighbourhood, 16 bytes wide: each dest byte sums ref at x and x+1 on a row and on the row below. */
+static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+			      const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+	/* NOTE(review): scale factor 5 is presumably consumed by the vis_pack16 steps -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	/* Prologue: REF_S0/REF_S4 take row 0 at off, REF_S2/REF_S6 take row 0 one byte to the right. */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants2[0], CONST_2);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+	/* off == 7 means off + 1 == 8: the shifted window is exactly the next doubleword, so it is copied. */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);
+		vis_src1(TMP4, REF_S6);
+	}
+	/* 2 rows per pass, no epilogue; NOTE(review): height must be even and >= 2. */
+	height >>= 1;
+	do {
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+		/* Back to alignment off on every pass; the branch below may switch to off_plus_1. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+		/* REF_0..REF_6 take the middle row; REF_S0..REF_S6 are refilled with the row after it. */
+		vis_faligndata(TMP6, TMP8, REF_S0);
+
+		vis_faligndata(TMP8, TMP10, REF_S4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+		/* Middle row sums (x + x+1 + CONST_2) are kept in TMP8/TMP10 and TMP12/TMP14 and reused for both output rows. */
+		vis_mul8x16au(REF_0, CONST_256, TMP0);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+		/* NOTE(review): presumably (a + b + c + d + 2) >> 2; constant values are defined elsewhere -- confirm. */
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+
+		vis_padd16(TMP0, CONST_2, TMP8);
+		vis_mul8x16au(REF_4, CONST_256, TMP0);
+
+		vis_padd16(TMP2, CONST_2, TMP10);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP2);
+
+		vis_padd16(TMP8, TMP4, TMP8);
+		vis_mul8x16au(REF_6, CONST_256, TMP4);
+
+		vis_padd16(TMP10, TMP6, TMP10);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP6);
+
+		vis_padd16(TMP12, TMP8, TMP12);
+
+		vis_padd16(TMP14, TMP10, TMP14);
+
+		vis_padd16(TMP12, TMP16, TMP12);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP0, CONST_2, TMP12);
+
+		vis_mul8x16au(REF_S0, CONST_256, TMP0);
+		vis_padd16(TMP2, CONST_2, TMP14);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_padd16(TMP12, TMP4, TMP12);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP4);
+		vis_padd16(TMP14, TMP6, TMP14);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+		vis_padd16(TMP20, TMP12, TMP20);
+
+		vis_padd16(TMP22, TMP14, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		vis_padd16(TMP0, TMP4, TMP24);
+		/* Second output row: new REF_S* row plus the saved middle-row sums. */
+		vis_mul8x16au(REF_S4, CONST_256, TMP0);
+		vis_padd16(TMP2, TMP6, TMP26);
+
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
+		vis_padd16(TMP24, TMP8, TMP24);
+
+		vis_padd16(TMP26, TMP10, TMP26);
+		vis_pack16(TMP24, DST_0);
+
+		vis_pack16(TMP26, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_pmerge(ZERO, REF_S6, TMP4);
+
+		vis_pmerge(ZERO,      REF_S6_1,  TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_padd16(TMP0, TMP12, TMP0);
+
+		vis_padd16(TMP2, TMP14, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+/* put, 2x2 neighbourhood, 8 bytes wide: each dest byte sums ref at x and x+1 on a row and on the row below. */
+static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	/* NOTE(review): scale factor 5 is presumably consumed by the vis_pack16 steps -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	/* Prologue: REF_S0 takes row 0 at off, REF_S2 takes row 0 one byte to the right. */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(constants2[0], CONST_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+	/* off == 7 means off + 1 == 8: the shifted window is exactly the next doubleword, so it is copied. */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+	} else {
+		vis_src1(TMP2, REF_S2);
+	}
+	/* 2 rows per pass, no epilogue; NOTE(review): height must be even and >= 2. */
+	height >>= 1;
+	do {	/* 26 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0,   CONST_256, TMP8);
+		vis_pmerge(ZERO,        REF_S2,    TMP12);
+		/* Back to alignment off on every pass; the branch below may switch to off_plus_1. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
+		vis_pmerge(ZERO,        REF_S2_1,  TMP14);
+
+		vis_ld64_2(ref, stride, TMP4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_S4);
+		/* REF_S4/REF_S6 take the middle row; REF_S0/REF_S2 are refilled with the row after it. */
+		vis_pmerge(ZERO, REF_S4, TMP18);
+
+		vis_pmerge(ZERO, REF_S4_1, TMP20);
+
+		vis_faligndata(TMP4, TMP6, REF_S0);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+		/* TMP18/TMP20 become the middle row sums (x + x+1 + CONST_2), reused for both output rows. */
+		vis_padd16(TMP18, CONST_2, TMP18);
+		vis_mul8x16au(REF_S6,   CONST_256, TMP22);
+		/* NOTE(review): presumably (a + b + c + d + 2) >> 2; constant values are defined elsewhere -- confirm. */
+		vis_padd16(TMP20, CONST_2, TMP20);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
+
+		vis_mul8x16au(REF_S0,   CONST_256, TMP26);
+		vis_pmerge(ZERO, REF_S0_1, TMP28);
+
+		vis_mul8x16au(REF_S2,   CONST_256, TMP30);
+		vis_padd16(TMP18, TMP22, TMP18);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
+		vis_padd16(TMP20, TMP24, TMP20);
+		/* First output row: previous row (TMP8..TMP14) plus the middle row sums. */
+		vis_padd16(TMP8,  TMP18, TMP8);
+
+		vis_padd16(TMP10, TMP20, TMP10);
+
+		vis_padd16(TMP8,  TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP8,  DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP18, TMP26, TMP18);
+		/* Second output row: middle row sums plus the newly loaded row (TMP26..TMP32). */
+		vis_padd16(TMP20, TMP28, TMP20);
+
+		vis_padd16(TMP18, TMP30, TMP18);
+
+		vis_padd16(TMP20, TMP32, TMP20);
+		vis_pack16(TMP18, DST_2);
+
+		vis_pack16(TMP20, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+			      const int stride, int height)
+{	/* 16-byte-wide rows: blends reference rows (x and x+1, row and row+1) with what is already in dest */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x+1 neighbours */
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);	/* presumably the scale later applied by vis_pack16 - TODO confirm */
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);	/* first reference row: three words cover 16 bytes at any offset */
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants6[0], CONST_6);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {	/* off + 1 == 8: the x+1 data is exactly the following word */
+		vis_src1(TMP2, REF_S2);
+		vis_src1(TMP4, REF_S6);
+	}
+
+	height >>= 1;	/* two output rows per pass; NOTE(review): needs an even height >= 2 or --height never reaches 0 in range */
+	do {	/* 55 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+
+		vis_alignaddr_g0((void *)off);	/* back to the unshifted alignment for this pass */
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64(dest[0], DST_0);	/* existing dest row, bytes 0-7 */
+		vis_faligndata(TMP6, TMP8, REF_S0);
+
+		vis_ld64_2(dest, 8, DST_2);	/* existing dest row, bytes 8-15 */
+		vis_faligndata(TMP8, TMP10, REF_S4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_0, TMP0);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);	/* REF_0/REF_2 are reused as scratch from here on */
+		vis_padd16(TMP0, CONST_6, TMP0);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP2, CONST_6, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP4);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_6, CONST_256, TMP8);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP10);
+
+		vis_padd16(TMP12, TMP16, TMP12);
+		vis_mul8x16au(REF_S0, CONST_256, REF_4);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
+
+		vis_padd16(TMP12, TMP30, TMP12);
+
+		vis_padd16(TMP14, TMP32, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);	/* first row of the pair, bytes 0-7 */
+		vis_padd16(TMP4, CONST_6, TMP4);
+
+		vis_ld64_2(dest, stride, DST_0);	/* existing dest of the second row, bytes 0-7 */
+		vis_padd16(TMP6, CONST_6, TMP6);
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+
+		vis_padd16(TMP4, TMP8, TMP4);
+		vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);
+
+		vis_padd16(TMP6, TMP10, TMP6);
+
+		vis_padd16(TMP20, TMP4, TMP20);
+
+		vis_padd16(TMP22, TMP6, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+
+		vis_padd16(TMP20, REF_0, TMP20);
+		vis_mul8x16au(REF_S4, CONST_256, REF_0);
+
+		vis_padd16(TMP22, REF_2, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);	/* first row of the pair, bytes 8-15 */
+		dest += stride;
+
+		vis_ld64_2(dest, 8, DST_2);	/* existing dest of the second row, bytes 8-15 */
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO,      REF_S4_1,  REF_2);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_padd16(REF_4, TMP0, TMP8);
+
+		vis_mul8x16au(REF_S6, CONST_256, REF_4);
+		vis_padd16(REF_6, TMP2, TMP10);
+
+		vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);	/* second row of the pair, bytes 0-7 */
+
+		vis_padd16(REF_0, TMP4, REF_0);
+
+		vis_mul8x16al(DST_2,   CONST_1024, TMP30);
+		vis_padd16(REF_2, TMP6, REF_2);
+
+		vis_mul8x16al(DST_3,   CONST_1024, TMP32);
+		vis_padd16(REF_0, REF_4, REF_0);
+
+		vis_padd16(REF_2, REF_6, REF_2);
+
+		vis_padd16(REF_0, TMP30, REF_0);
+
+		/* stall */
+
+		vis_padd16(REF_2, TMP32, REF_2);
+		vis_pack16(REF_0, DST_2);
+
+		vis_pack16(REF_2, DST_3);
+		vis_st64_2(DST_2, dest, 8);	/* second row of the pair, bytes 8-15 */
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* 8-byte-wide rows: blends reference rows (x and x+1, row and row+1) with what is already in dest */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x+1 neighbours */
+	int stride_8 = stride + 8;
+
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);	/* presumably the scale later applied by vis_pack16 - TODO confirm */
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);	/* first reference row: two words cover 8 bytes at any offset */
+	vis_fzero(ZERO);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64(constants6[0], CONST_6);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+	} else {	/* off + 1 == 8: the x+1 data is exactly the following word */
+		vis_src1(TMP2, REF_S2);
+	}
+
+	height >>= 1;	/* two output rows per pass; NOTE(review): needs an even height >= 2 or --height never reaches 0 in range */
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP8);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP10);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP14);
+
+		vis_alignaddr_g0((void *)off);	/* back to the unshifted alignment for this pass */
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_faligndata(TMP0, TMP2, REF_S4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);	/* existing dest, first row of the pair */
+		vis_faligndata(TMP4, TMP6, REF_S0);
+
+		vis_ld64_2(dest, stride, DST_2);	/* existing dest, second row of the pair */
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_S4, TMP22);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP24);
+
+		vis_mul8x16au(REF_S6, CONST_256, TMP26);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP28);
+
+		vis_mul8x16au(REF_S0, CONST_256, REF_S4);	/* REF_S4/REF_S6 are reused as scratch from here on */
+		vis_padd16(TMP22, CONST_6, TMP22);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
+		vis_padd16(TMP24, CONST_6, TMP24);
+
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
+		vis_padd16(TMP22, TMP26, TMP22);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP24, TMP28, TMP24);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP26);
+		vis_padd16(TMP8, TMP22, TMP8);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
+		vis_padd16(TMP10, TMP24, TMP10);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);	/* first row of the pair */
+		dest += stride;
+
+		vis_padd16(REF_S4, TMP22, TMP12);
+
+		vis_padd16(REF_S6, TMP24, TMP14);
+
+		vis_padd16(TMP12, TMP26, TMP12);
+
+		vis_padd16(TMP14, TMP28, TMP14);
+
+		vis_padd16(TMP12, REF_0, TMP12);
+
+		vis_padd16(TMP14, REF_2, TMP14);
+		vis_pack16(TMP12, DST_2);
+
+		vis_pack16(TMP14, DST_3);
+		vis_st64(DST_2, dest[0]);	/* second row of the pair */
+		dest += stride;
+	} while (--height);
+}
+
+MPEG2_MC_EXTERN(vis);
+
+#endif  /* !(ARCH_SPARC) */
diff --git a/src/libmpeg2new/libmpeg2/mpeg2_internal.h b/src/libmpeg2new/libmpeg2/mpeg2_internal.h
index ccd1bc4b5..fec7d4744 100644
--- a/src/libmpeg2new/libmpeg2/mpeg2_internal.h
+++ b/src/libmpeg2new/libmpeg2/mpeg2_internal.h
@@ -21,6 +21,8 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#define STATE_INTERNAL_NORETURN ((mpeg2_state_t)-1)
+
 /* macroblock modes */
 #define MACROBLOCK_INTRA 1
 #define MACROBLOCK_PATTERN 2
@@ -29,12 +31,11 @@
 #define MACROBLOCK_QUANT 16
 #define DCT_TYPE_INTERLACED 32
 /* motion_type */
-#define MOTION_TYPE_MASK (3*64)
-#define MOTION_TYPE_BASE 64
-#define MC_FIELD (1*64)
-#define MC_FRAME (2*64)
-#define MC_16X8 (2*64)
-#define MC_DMV (3*64)
+#define MOTION_TYPE_SHIFT 6
+#define MC_FIELD 1
+#define MC_FRAME 2
+#define MC_16X8 2
+#define MC_DMV 3
 
 /* picture structure */
 #define TOP_FIELD 1
@@ -47,6 +48,8 @@
 #define B_TYPE 3
 #define D_TYPE 4
 
+typedef void mpeg2_mc_fct (uint8_t *, const uint8_t *, int, int);
+
 typedef struct {
     uint8_t * ref[2][3];
     uint8_t ** ref2[2];
@@ -54,27 +57,27 @@ typedef struct {
     int f_code[2];
 } motion_t;
 
+typedef void motion_parser_t (mpeg2_decoder_t * decoder,
+			      motion_t * motion,
+			      mpeg2_mc_fct * const * table);
+
 struct mpeg2_decoder_s {
     /* first, state that carries information from one macroblock to the */
     /* next inside a slice, and is never used outside of mpeg2_slice() */
 
-    /* DCT coefficients - should be kept aligned ! */
-    int16_t DCTblock[64];
-
     /* bit parsing stuff */
     uint32_t bitstream_buf;		/* current 32 bit working set */
     int bitstream_bits;			/* used bits in working set */
     const uint8_t * bitstream_ptr;	/* buffer with stream data */
 
     uint8_t * dest[3];
-    uint8_t * picture_dest[3];
-    void (* convert) (void * fbuf_id, uint8_t * const * src,
-		      unsigned int v_offset);
-    void * fbuf_id;
 
     int offset;
     int stride;
     int uv_stride;
+    int slice_stride;
+    int slice_uv_stride;
+    int stride_frame;
     unsigned int limit_x;
     unsigned int limit_y_16;
     unsigned int limit_y_8;
@@ -85,24 +88,34 @@ struct mpeg2_decoder_s {
     /* predictors */
     motion_t b_motion;
     motion_t f_motion;
+    motion_parser_t * motion_parser[5];
 
     /* predictor for DC coefficients in intra blocks */
     int16_t dc_dct_pred[3];
 
-    int quantizer_scale;	/* remove */
-    int dmv_offset;		/* remove */
-    unsigned int v_offset;	/* remove */
+    /* DCT coefficients */
+    int16_t DCTblock[64] ATTR_ALIGN(64);
+
+    uint8_t * picture_dest[3];
+    void (* convert) (void * convert_id, uint8_t * const * src,
+		      unsigned int v_offset);
+    void * convert_id;
+
+    int dmv_offset;
+    unsigned int v_offset;
 
     /* now non-slice-specific information */
 
     /* sequence header stuff */
-    uint8_t intra_quantizer_matrix [64];
-    uint8_t non_intra_quantizer_matrix [64];
+    uint16_t * quantizer_matrix[4];
+    uint16_t (* chroma_quantizer[2])[64];
+    uint16_t quantizer_prescale[4][32][64];
 
     /* The width and height of the picture snapped to macroblock units */
     int width;
     int height;
     int vertical_position_extension;
+    int chroma_format;
 
     /* picture header stuff */
 
@@ -120,8 +133,6 @@ struct mpeg2_decoder_s {
     /* bool to indicate whether intra blocks have motion vectors */
     /* (for concealment) */
     int concealment_motion_vectors;
-    /* bit to indicate which quantization table to use */
-    int q_scale_type;
     /* bool to use different vlc tables */
     int intra_vlc_format;
     /* used for DMV MC */
@@ -161,10 +172,10 @@ struct mpeg2dec_s {
     /* last start code ? */
     uint8_t code;
 
-    /* PTS */
-    uint32_t pts_current, pts_previous;
-    int num_pts;
-    int bytes_since_pts;
+    /* picture tags */
+    uint32_t tag_current, tag2_current, tag_previous, tag2_previous;
+    int num_tags;
+    int bytes_since_tag;
 
     int first;
     int alloc_index_user;
@@ -172,9 +183,13 @@ struct mpeg2dec_s {
     uint8_t first_decode_slice;
     uint8_t nb_decode_slices;
 
+    unsigned int user_data_len;
+
     mpeg2_sequence_t new_sequence;
     mpeg2_sequence_t sequence;
+    mpeg2_gop_t new_gop;
     mpeg2_gop_t gop;
+    mpeg2_picture_t new_picture;
     mpeg2_picture_t pictures[4];
     mpeg2_picture_t * picture;
     /*const*/ mpeg2_fbuf_t * fbuf[3];	/* 0: current fbuf, 1-2: prediction fbufs */
@@ -184,11 +199,13 @@ struct mpeg2dec_s {
 
     uint8_t * yuv_buf[3][3];
     int yuv_index;
-    void * convert_id;
-    int convert_size[3];
-    void (* convert_start) (void * id, uint8_t * const * dest, int flags);
-    void (* convert_copy) (void * id, uint8_t * const * src,
-			   unsigned int v_offset);
+    mpeg2_convert_t * convert;
+    void * convert_arg;
+    unsigned int convert_id_size;
+    int convert_stride;
+    void (* convert_start) (void * id, const mpeg2_fbuf_t * fbuf,
+			    const mpeg2_picture_t * picture,
+			    const mpeg2_gop_t * gop);
 
     uint8_t * buf_start;
     uint8_t * buf_end;
@@ -196,8 +213,9 @@ struct mpeg2dec_s {
     int16_t display_offset_x, display_offset_y;
 
     int copy_matrix;
-    uint8_t intra_quantizer_matrix [64];
-    uint8_t non_intra_quantizer_matrix [64];
+    int8_t q_scale_type, scaled[4];
+    uint8_t quantizer_matrix[4][64];
+    uint8_t new_quantizer_matrix[4][64];
 };
 
 typedef struct {
@@ -207,50 +225,35 @@ typedef struct {
     int dummy;
 } cpu_state_t;
 
-/* alloc.c */
-#define ALLOC_MPEG2DEC 0
-#define ALLOC_CHUNK 1
-#define ALLOC_YUV 2
-#define ALLOC_CONVERT_ID 3
-#define ALLOC_CONVERTED 4
-void * mpeg2_malloc (int size, int reason);
-void mpeg2_free (void * buf);
-
 /* cpu_accel.c */
-uint32_t mpeg2_detect_accel (void);
+uint32_t mpeg2_detect_accel (uint32_t accel);
 
 /* cpu_state.c */
 void mpeg2_cpu_state_init (uint32_t accel);
 
 /* decode.c */
-mpeg2_state_t mpeg2_seek_sequence (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_seek_header (mpeg2dec_t * mpeg2dec);
 mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec);
 
 /* header.c */
 void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec);
+void mpeg2_reset_info (mpeg2_info_t * info);
 int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec);
 int mpeg2_header_gop (mpeg2dec_t * mpeg2dec);
 mpeg2_state_t mpeg2_header_picture_start (mpeg2dec_t * mpeg2dec);
 int mpeg2_header_picture (mpeg2dec_t * mpeg2dec);
 int mpeg2_header_extension (mpeg2dec_t * mpeg2dec);
 int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec);
-void mpeg2_header_matrix_finalize (mpeg2dec_t * mpeg2dec);
 void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_gop_finalize (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_picture_finalize (mpeg2dec_t * mpeg2dec, uint32_t accels);
 mpeg2_state_t mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec);
 mpeg2_state_t mpeg2_header_end (mpeg2dec_t * mpeg2dec);
-void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int coding_type);
+void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int b_type);
 
 /* idct.c */
 void mpeg2_idct_init (uint32_t accel);
 
-/* idct_mlib.c */
-void mpeg2_idct_add_mlib (int last, int16_t * block,
-			  uint8_t * dest, int stride);
-void mpeg2_idct_copy_mlib_non_ieee (int16_t * block, uint8_t * dest,
-				    int stride);
-void mpeg2_idct_add_mlib_non_ieee (int last, int16_t * block,
-				   uint8_t * dest, int stride);
-
 /* idct_mmx.c */
 void mpeg2_idct_copy_mmxext (int16_t * block, uint8_t * dest, int stride);
 void mpeg2_idct_add_mmxext (int last, int16_t * block,
@@ -278,8 +281,6 @@ void mpeg2_idct_alpha_init (void);
 /* motion_comp.c */
 void mpeg2_mc_init (uint32_t accel);
 
-typedef void mpeg2_mc_fct (uint8_t *, const uint8_t *, int, int);
-
 typedef struct {
     mpeg2_mc_fct * put [8];
     mpeg2_mc_fct * avg [8];
@@ -298,4 +299,4 @@ extern mpeg2_mc_t mpeg2_mc_mmxext;
 extern mpeg2_mc_t mpeg2_mc_3dnow;
 extern mpeg2_mc_t mpeg2_mc_altivec;
 extern mpeg2_mc_t mpeg2_mc_alpha;
-extern mpeg2_mc_t mpeg2_mc_mlib;
+extern mpeg2_mc_t mpeg2_mc_vis;
diff --git a/src/libmpeg2new/libmpeg2/rgb.c b/src/libmpeg2new/libmpeg2/rgb.c
new file mode 100644
index 000000000..8863b0b9f
--- /dev/null
+++ b/src/libmpeg2new/libmpeg2/rgb.c
@@ -0,0 +1,598 @@
+/*
+ * rgb.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+#include "attributes.h"
+
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "mpeg2convert.h"
+#include "convert_internal.h"
+
+static int matrix_coefficients = 6;	/* row of Inverse_Table_6_9 that rgb_c_init reads */
+
+static const int Inverse_Table_6_9[8][4] = {	/* columns feed crv, cbu, cgu (negated), cgv (negated) in rgb_c_init */
+    {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
+    {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
+    {104597, 132201, 25675, 53279}, /* unspecified */
+    {104597, 132201, 25675, 53279}, /* reserved */
+    {104448, 132798, 24759, 53109}, /* FCC */
+    {104597, 132201, 25675, 53279}, /* ITU-R Rec. 624-4 System B, G */
+    {104597, 132201, 25675, 53279}, /* SMPTE 170M */
+    {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
+};
+
+static const uint8_t dither[] ATTR_ALIGN(32) = {	/* read through pd (= dither + 2 * dithpos) by DSTDITHER */
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+     3,  9, 27, 63,  1,  4, 25, 59,  5, 12, 28, 67,  3,  7, 26, 62,
+     3,  9, 27, 63,  1,  4, 25, 59,  5, 12, 28, 67,  3,  7, 26, 62,
+     3,  9, 27, 63,  1,  4, 25, 59,  5, 12, 28, 67,  3,  7, 26, 62,
+     3,  9, 27, 63,  1,  4, 25, 59,  5, 12, 28, 67,  3,  7, 26, 62,
+    19, 45, 11, 27, 17, 41,  9, 22, 21, 49, 13, 30, 19, 44, 11, 26,
+    19, 45, 11, 27, 17, 41,  9, 22, 21, 49, 13, 30, 19, 44, 11, 26,
+    19, 45, 11, 27, 17, 41,  9, 22, 21, 49, 13, 30, 19, 44, 11, 26,
+    19, 45, 11, 27, 17, 41,  9, 22, 21, 49, 13, 30, 19, 44, 11, 26,
+     0,  2, 24, 57,  6, 15, 30, 70,  0,  1, 23, 55,  6, 14, 29, 69,
+     0,  2, 24, 57,  6, 15, 30, 70,  0,  1, 23, 55,  6, 14, 29, 69,
+     0,  2, 24, 57,  6, 15, 30, 70,  0,  1, 23, 55,  6, 14, 29, 69,
+     0,  2, 24, 57,  6, 15, 30, 70,  0,  1, 23, 55,  6, 14, 29, 69,
+    16, 38,  8, 20, 22, 52, 14, 34, 16, 37,  8, 19, 21, 51, 14, 33,
+    16, 38,  8, 20, 22, 52, 14, 34, 16, 37,  8, 19, 21, 51, 14, 33,
+    16, 38,  8, 20, 22, 52, 14, 34, 16, 37,  8, 19, 21, 51, 14, 33,
+    16, 38,  8, 20, 22, 52, 14, 34, 16, 37,  8, 19, 21, 51, 14, 33,
+     4, 11, 28, 66,  2,  6, 26, 61,  4, 10, 27, 65,  2,  5, 25, 60,
+     4, 11, 28, 66,  2,  6, 26, 61,  4, 10, 27, 65,  2,  5, 25, 60,
+     4, 11, 28, 66,  2,  6, 26, 61,  4, 10, 27, 65,  2,  5, 25, 60,
+     4, 11, 28, 66,  2,  6, 26, 61,  4, 10, 27, 65,  2,  5, 25, 60,
+    20, 47, 12, 29, 18, 43, 10, 25, 20, 46, 12, 28, 18, 42, 10, 23,
+    20, 47, 12, 29, 18, 43, 10, 25, 20, 46, 12, 28, 18, 42, 10, 23,
+    20, 47, 12, 29, 18, 43, 10, 25, 20, 46, 12, 28, 18, 42, 10, 23,
+    20, 47, 12, 29, 18, 43, 10, 25, 20, 46, 12, 28, 18, 42, 10, 23,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35
+};
+
+static const uint8_t dither_temporal[64] = {	/* rgb_start indexes this with temporal_reference & 63 to seed dither_offset */
+    0x00, 0x20, 0x21, 0x01, 0x40, 0x60, 0x61, 0x41,
+    0x42, 0x62, 0x63, 0x43, 0x02, 0x22, 0x23, 0x03,
+    0x80, 0xa0, 0xa1, 0x81, 0xc0, 0xe0, 0xe1, 0xc1,
+    0xc2, 0xe2, 0xe3, 0xc3, 0x82, 0xa2, 0xa3, 0x83,
+    0x84, 0xa4, 0xa5, 0x85, 0xc4, 0xe4, 0xe5, 0xc5,
+    0xc6, 0xe6, 0xe7, 0xc7, 0x86, 0xa6, 0xa7, 0x87,
+    0x04, 0x24, 0x25, 0x05, 0x44, 0x64, 0x65, 0x45,
+    0x46, 0x66, 0x67, 0x47, 0x06, 0x26, 0x27, 0x07
+};
+
+typedef struct {	/* state of the C converter; rgb_c_init places the lookup ramps right behind it (id + 1) */
+    convert_rgb_t base;
+    void * table_rV[256];	/* indexed by V: pointer into the red ramp */
+    void * table_gU[256];	/* indexed by U: pointer into the green ramp */
+    int table_gV[256];		/* indexed by V: byte offset the RGB macro adds to table_gU[U] */
+    void * table_bU[256];	/* indexed by U: pointer into the blue ramp */
+} convert_rgb_c_t;
+
+#define RGB(type,i)	/* sets U, V and the r/g/b ramp pointers from chroma sample i */	\
+    U = pu[i];								\
+    V = pv[i];								\
+    r = (type *) id->table_rV[V];					\
+    g = (type *) (((uint8_t *)id->table_gU[U]) + id->table_gV[V]);	\
+    b = (type *) id->table_bU[U];
+
+#define DST(py,dst,i,j)	/* one packed pixel: sum of the three ramp entries at luma Y; j unused */	\
+    Y = py[i];				\
+    dst[i] = r[Y] + g[Y] + b[Y];
+
+#define DSTRGB(py,dst,i,j)	/* three bytes per pixel, stored r, g, b; j unused */	\
+    Y = py[i];							\
+    dst[3*i] = r[Y]; dst[3*i+1] = g[Y]; dst[3*i+2] = b[Y];
+
+#define DSTBGR(py,dst,i,j)	/* three bytes per pixel, stored b, g, r; j unused */	\
+    Y = py[i];							\
+    dst[3*i] = b[Y]; dst[3*i+1] = g[Y]; dst[3*i+2] = r[Y];
+
+#define DSTDITHER(py,dst,i,j)	/* like DST, but each ramp index is biased by pd[] entries; j picks the 96-byte row */	\
+    Y = py[i];								  \
+    dst[i] = r[Y+pd[2*i+96*j]] + g[Y-pd[2*i+96*j]] + b[Y+pd[2*i+1+96*j]];
+
+#define DO(x) x	/* passed as DITHER to keep the dither-only statements */
+#define SKIP(x)	/* passed as DITHER to drop the dither-only statements */
+
+#define DECLARE_420(func,type,num,DST,DITHER)	/* converter where two luma rows share one chroma row */	\
+static void func (void * _id, uint8_t * const * src,			\
+		  unsigned int v_offset)				\
+{									\
+    const convert_rgb_c_t * const id = (convert_rgb_c_t *) _id;		\
+    type * dst_1;							\
+    const uint8_t * py_1, * pu, * pv;					\
+    int i;								\
+    DITHER(uint8_t dithpos = id->base.dither_offset;)			\
+									\
+    dst_1 = (type *)(id->base.rgb_ptr + id->base.rgb_slice * v_offset);	\
+    py_1 = src[0];	pu = src[1];	pv = src[2];			\
+									\
+    i = 8;			/* 8 passes, two luma rows each */	\
+    do {								\
+	const uint8_t * py_2;						\
+	int j, U, V, Y;							\
+	const type * r, * g, * b;					\
+	type * dst_2;							\
+	DITHER(const uint8_t * const pd = dither + 2 * dithpos;)	\
+									\
+	dst_2 = (type *)((char *)dst_1 + id->base.rgb_stride);		\
+	py_2 = py_1 + id->base.y_stride;				\
+	j = id->base.width;	/* one count per 8 luma pixels */	\
+	do {								\
+	    RGB (type, 0)						\
+	    DST (py_1, dst_1, 0, 0)					\
+	    DST (py_1, dst_1, 1, 0)					\
+	    DST (py_2, dst_2, 0, 1)					\
+	    DST (py_2, dst_2, 1, 1)					\
+									\
+	    RGB (type, 1)						\
+	    DST (py_2, dst_2, 2, 1)					\
+	    DST (py_2, dst_2, 3, 1)					\
+	    DST (py_1, dst_1, 2, 0)					\
+	    DST (py_1, dst_1, 3, 0)					\
+									\
+	    RGB (type, 2)						\
+	    DST (py_1, dst_1, 4, 0)					\
+	    DST (py_1, dst_1, 5, 0)					\
+	    DST (py_2, dst_2, 4, 1)					\
+	    DST (py_2, dst_2, 5, 1)					\
+									\
+	    RGB (type, 3)						\
+	    DST (py_2, dst_2, 6, 1)					\
+	    DST (py_2, dst_2, 7, 1)					\
+	    DST (py_1, dst_1, 6, 0)					\
+	    DST (py_1, dst_1, 7, 0)					\
+									\
+	    pu += 4;							\
+	    pv += 4;							\
+	    py_1 += 8;							\
+	    py_2 += 8;							\
+	    dst_1 += 8 * num;						\
+	    dst_2 += 8 * num;						\
+	} while (--j);							\
+	if (--i == id->base.field) {	/* re-seed all pointers one frame line / one rgb_slice further */	\
+	    dst_1 = (type *)(id->base.rgb_ptr +				\
+			     id->base.rgb_slice * (v_offset + 1));	\
+	    py_1 = src[0] + id->base.y_stride_frame;			\
+	    pu = src[1] + id->base.uv_stride_frame;			\
+	    pv = src[2] + id->base.uv_stride_frame;			\
+	} else {							\
+	    py_1 += id->base.y_increm;					\
+	    pu += id->base.uv_increm;					\
+	    pv += id->base.uv_increm;					\
+	    dst_1 = (type *)((char *)dst_1 + id->base.rgb_increm);	\
+	    DITHER(dithpos += id->base.dither_stride;)			\
+	}								\
+    } while (i);							\
+}
+
+DECLARE_420 (rgb_c_32_420, uint32_t, 1, DST, SKIP)
+DECLARE_420 (rgb_c_24_rgb_420, uint8_t, 3, DSTRGB, SKIP)
+DECLARE_420 (rgb_c_24_bgr_420, uint8_t, 3, DSTBGR, SKIP)
+DECLARE_420 (rgb_c_16_420, uint16_t, 1, DST, SKIP)
+DECLARE_420 (rgb_c_8_420, uint8_t, 1, DSTDITHER, DO)	/* the only variant that enables the dither statements */
+
+#define DECLARE_422(func,type,num,DST,DITHER)	/* converter with one chroma row per luma row, half-width chroma */	\
+static void func (void * _id, uint8_t * const * src,			\
+		  unsigned int v_offset)				\
+{									\
+    const convert_rgb_c_t * const id = (convert_rgb_c_t *) _id;		\
+    type * dst;								\
+    const uint8_t * py, * pu, * pv;					\
+    int i;								\
+    DITHER(uint8_t dithpos = id->base.dither_offset;)			\
+									\
+    dst = (type *)(id->base.rgb_ptr + id->base.rgb_stride * v_offset);	\
+    py = src[0];	pu = src[1];	pv = src[2];			\
+									\
+    i = 16;			/* 16 rows per call */			\
+    do {								\
+	int j, U, V, Y;							\
+	const type * r, * g, * b;					\
+	DITHER(const uint8_t * const pd = dither + 2 * dithpos;)	\
+									\
+	j = id->base.width;	/* one count per 8 luma pixels */	\
+	do {								\
+	    RGB (type, 0)						\
+	    DST (py, dst, 0, 0)						\
+	    DST (py, dst, 1, 0)						\
+									\
+	    RGB (type, 1)						\
+	    DST (py, dst, 2, 0)						\
+	    DST (py, dst, 3, 0)						\
+									\
+	    RGB (type, 2)						\
+	    DST (py, dst, 4, 0)						\
+	    DST (py, dst, 5, 0)						\
+									\
+	    RGB (type, 3)						\
+	    DST (py, dst, 6, 0)						\
+	    DST (py, dst, 7, 0)						\
+									\
+	    pu += 4;							\
+	    pv += 4;							\
+	    py += 8;							\
+	    dst += 8 * num;						\
+	} while (--j);							\
+	py += id->base.y_increm;					\
+	pu += id->base.uv_increm;					\
+	pv += id->base.uv_increm;					\
+	dst = (type *)((char *)dst + id->base.rgb_increm);		\
+	DITHER(dithpos += id->base.dither_stride;)			\
+    } while (--i);							\
+}
+
+DECLARE_422 (rgb_c_32_422, uint32_t, 1, DST, SKIP)
+DECLARE_422 (rgb_c_24_rgb_422, uint8_t, 3, DSTRGB, SKIP)
+DECLARE_422 (rgb_c_24_bgr_422, uint8_t, 3, DSTBGR, SKIP)
+DECLARE_422 (rgb_c_16_422, uint16_t, 1, DST, SKIP)
+DECLARE_422 (rgb_c_8_422, uint8_t, 1, DSTDITHER, DO)	/* the only variant that enables the dither statements */
+
+#define DECLARE_444(func,type,num,DST,DITHER)	/* converter with one chroma sample per luma sample */	\
+static void func (void * _id, uint8_t * const * src,			\
+		  unsigned int v_offset)				\
+{									\
+    const convert_rgb_c_t * const id = (convert_rgb_c_t *) _id;		\
+    type * dst;								\
+    const uint8_t * py, * pu, * pv;					\
+    int i;								\
+    DITHER(uint8_t dithpos = id->base.dither_offset;)			\
+									\
+    dst = (type *)(id->base.rgb_ptr + id->base.rgb_stride * v_offset);	\
+    py = src[0];	pu = src[1];	pv = src[2];			\
+									\
+    i = 16;			/* 16 rows per call */			\
+    do {								\
+	int j, U, V, Y;							\
+	const type * r, * g, * b;					\
+	DITHER(const uint8_t * const pd = dither + 2 * dithpos;)	\
+									\
+	j = id->base.width;	/* one count per 8 pixels */		\
+	do {								\
+	    RGB (type, 0)						\
+	    DST (py, dst, 0, 0)						\
+	    RGB (type, 1)						\
+	    DST (py, dst, 1, 0)						\
+	    RGB (type, 2)						\
+	    DST (py, dst, 2, 0)						\
+	    RGB (type, 3)						\
+	    DST (py, dst, 3, 0)						\
+	    RGB (type, 4)						\
+	    DST (py, dst, 4, 0)						\
+	    RGB (type, 5)						\
+	    DST (py, dst, 5, 0)						\
+	    RGB (type, 6)						\
+	    DST (py, dst, 6, 0)						\
+	    RGB (type, 7)						\
+	    DST (py, dst, 7, 0)						\
+									\
+	    pu += 8;							\
+	    pv += 8;							\
+	    py += 8;							\
+	    dst += 8 * num;						\
+	} while (--j);							\
+	py += id->base.y_increm;				   	\
+	pu += id->base.y_increm;	/* chroma steps by the luma increment here */	\
+	pv += id->base.y_increm;				   	\
+	dst = (type *)((char *)dst + id->base.rgb_increm);		\
+	DITHER(dithpos += id->base.dither_stride;)			\
+    } while (--i);							\
+}
+
+DECLARE_444 (rgb_c_32_444, uint32_t, 1, DST, SKIP)
+DECLARE_444 (rgb_c_24_rgb_444, uint8_t, 3, DSTRGB, SKIP)
+DECLARE_444 (rgb_c_24_bgr_444, uint8_t, 3, DSTBGR, SKIP)
+DECLARE_444 (rgb_c_16_444, uint16_t, 1, DST, SKIP)
+DECLARE_444 (rgb_c_8_444, uint8_t, 1, DSTDITHER, DO)	/* the only variant that enables the dither statements */
+
+static void rgb_start (void * _id, const mpeg2_fbuf_t * fbuf,
+		       const mpeg2_picture_t * picture,
+		       const mpeg2_gop_t * gop)
+{   /* per-picture setup of strides, line increments and dither phase; gop is not read */
+    convert_rgb_t * const conv = (convert_rgb_t *) _id;
+    int chroma_stride = conv->uv_stride_frame;
+    conv->field = 0;
+    conv->rgb_ptr = fbuf->buf[0];
+    conv->y_stride = conv->y_stride_frame;
+    conv->rgb_slice = conv->rgb_stride = conv->rgb_stride_frame;
+    conv->dither_offset = dither_temporal[picture->temporal_reference & 63];
+    conv->dither_stride = 32;
+    if ((picture->nb_fields == 1) ||	/* one field, or non-progressive 4:2:0: every stride doubles */
+	(!(picture->flags & PIC_FLAG_PROGRESSIVE_FRAME) && conv->chroma420)) {
+	conv->dither_offset += 16;
+	conv->dither_stride <<= 1;
+	conv->rgb_stride <<= 1;
+	conv->y_stride <<= 1;
+	chroma_stride <<= 1;
+	if (picture->nb_fields == 1) {
+	    conv->rgb_slice <<= 1;
+	    if (!(picture->flags & PIC_FLAG_TOP_FIELD_FIRST)) {
+		conv->dither_offset += 32;
+		conv->rgb_ptr += conv->rgb_stride_frame;	/* start one output line down */
+	    }
+	} else
+	    conv->field = 8 >> conv->convert420;
+    }
+    conv->dither_stride <<= conv->convert420;
+    conv->uv_increm = chroma_stride - conv->uv_stride_frame;
+    conv->y_increm = (conv->y_stride << conv->convert420) - conv->y_stride_frame;
+    conv->rgb_increm = (conv->rgb_stride << conv->convert420) - conv->rgb_stride_min;
+}
+
+static inline int div_round (int dividend, int divisor)
+{
+    const int half = divisor >> 1;	/* bias that turns truncation into round-to-nearest */
+    if (dividend <= 0)			/* mirror non-positive inputs so rounding is symmetric */
+	return -((half - dividend) / divisor);
+    return (dividend + half) / divisor;
+}
+
+static unsigned int rgb_c_init (convert_rgb_c_t * id,
+				mpeg2convert_rgb_order_t order,
+				unsigned int bpp)
+{   /* id == NULL: return the table bytes needed for bpp; otherwise build the tables behind *id and return 0 */
+    int i;
+    uint8_t table_Y[1024];	/* clamped 0..255 ramp filled by the first loop */
+    uint32_t * table_32 = 0;
+    uint16_t * table_16 = 0;
+    uint8_t * table_8 = 0;
+    uint8_t * table_332 = 0;
+    int entry_size = 0;		/* bytes per ramp entry for the chosen bpp */
+    void * table_r = 0;
+    void * table_g = 0;
+    void * table_b = 0;
+
+    int crv = Inverse_Table_6_9[matrix_coefficients][0];
+    int cbu = Inverse_Table_6_9[matrix_coefficients][1];
+    int cgu = -Inverse_Table_6_9[matrix_coefficients][2];
+    int cgv = -Inverse_Table_6_9[matrix_coefficients][3];
+
+    for (i = 0; i < 1024; i++) {
+	int j;
+
+	j = (76309 * (i - 384 - 16) + 32768) >> 16;	/* fixed point with 16 fraction bits; index 384 stands for input 0 */
+	table_Y[i] = (j < 0) ? 0 : ((j > 255) ? 255 : j);
+    }
+
+    switch (bpp) {
+    case 32:
+	if (!id)	/* size query */
+	    return (197 + 2*682 + 256 + 132) * sizeof (uint32_t);
+	table_32 = (uint32_t *) (id + 1);	/* ramps start right behind the struct */
+	entry_size = sizeof (uint32_t);
+	table_r = table_32 + 197;
+	table_b = table_32 + 197 + 685;
+	table_g = table_32 + 197 + 2*682;
+
+	for (i = -197; i < 256+197; i++)
+	    ((uint32_t *) table_r)[i] =
+		table_Y[i+384] << ((order == MPEG2CONVERT_RGB) ? 16 : 0);
+	for (i = -132; i < 256+132; i++)
+	    ((uint32_t *) table_g)[i] = table_Y[i+384] << 8;
+	for (i = -232; i < 256+232; i++)
+	    ((uint32_t *) table_b)[i] =
+		table_Y[i+384] << ((order == MPEG2CONVERT_RGB) ? 0 : 16);
+	break;
+
+    case 24:
+	if (!id)	/* size query */
+	    return (256 + 2*232) * sizeof (uint8_t);
+	table_8 = (uint8_t *) (id + 1);
+	entry_size = sizeof (uint8_t);
+	table_r = table_g = table_b = table_8 + 232;	/* one shared byte ramp for all three channels */
+
+	for (i = -232; i < 256+232; i++)
+	    ((uint8_t * )table_b)[i] = table_Y[i+384];
+	break;
+
+    case 15:
+    case 16:
+	if (!id)	/* size query */
+	    return (197 + 2*682 + 256 + 132) * sizeof (uint16_t);
+	table_16 = (uint16_t *) (id + 1);
+	entry_size = sizeof (uint16_t);
+	table_r = table_16 + 197;
+	table_b = table_16 + 197 + 685;
+	table_g = table_16 + 197 + 2*682;
+
+	for (i = -197; i < 256+197; i++) {
+	    int j = table_Y[i+384] >> 3;
+
+	    if (order == MPEG2CONVERT_RGB)
+		j <<= ((bpp==16) ? 11 : 10);
+
+	    ((uint16_t *)table_r)[i] = j;
+	}
+	for (i = -132; i < 256+132; i++) {
+	    int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);	/* green keeps 6 bits at 16 bpp, 5 bits at 15 bpp */
+
+	    ((uint16_t *)table_g)[i] = j << 5;
+	}
+	for (i = -232; i < 256+232; i++) {
+	    int j = table_Y[i+384] >> 3;
+
+	    if (order == MPEG2CONVERT_BGR)
+		j <<= ((bpp==16) ? 11 : 10);
+
+	    ((uint16_t *)table_b)[i] = j;
+	}
+	break;
+
+    case 8:
+	if (!id)	/* size query */
+	    return (197 + 2*682 + 256 + 232 + 71) * sizeof (uint8_t);
+	table_332 = (uint8_t *) (id + 1);
+	entry_size = sizeof (uint8_t);
+	table_r = table_332 + 197;
+	table_g = table_332 + 197 + 682 + 30;
+	table_b = table_332 + 197 + 2*682;
+
+	for (i = -197; i < 256+197+30; i++)	/* extra 30/71 entries: presumably headroom for the dither bias added by DSTDITHER - TODO confirm */
+	    ((uint8_t *)table_r)[i] = ((table_Y[i+384] * 7 / 255) <<
+				       (order == MPEG2CONVERT_RGB ? 5 : 0));
+	for (i = -132; i < 256+132+30; i++)
+	    ((uint8_t *)table_g)[i-30] = ((table_Y[i+384] * 7 / 255) <<
+					  (order == MPEG2CONVERT_RGB ? 2 : 3));
+	for (i = -232; i < 256+232+71; i++)
+	    ((uint8_t *)table_b)[i] = ((table_Y[i+384] / 85) <<
+				       (order == MPEG2CONVERT_RGB ? 0 : 6));
+	break;
+    }	/* NOTE(review): no default case; an unhandled bpp keeps entry_size 0 and, with id == NULL, reaches the stores below */
+
+    for (i = 0; i < 256; i++) {	/* per-chroma-value pointers (and the gV byte offset) into the ramps */
+	id->table_rV[i] = (((uint8_t *)table_r) +
+			   entry_size * div_round (crv * (i-128), 76309));
+	id->table_gU[i] = (((uint8_t *)table_g) +
+			   entry_size * div_round (cgu * (i-128), 76309));
+	id->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
+	id->table_bU[i] = (((uint8_t *)table_b) +
+			   entry_size * div_round (cbu * (i-128), 76309));
+    }
+
+    return 0;
+}
+
+static int rgb_internal (mpeg2convert_rgb_order_t order, unsigned int bpp,
+			 int stage, void * _id, const mpeg2_sequence_t * seq,
+			 int stride, uint32_t accel, void * arg,
+			 mpeg2_convert_init_t * result)
+{   /* Picks the copy routine, sets result->id_size, then handles `stage`. */
+    convert_rgb_t * id = (convert_rgb_t *) _id;
+    mpeg2convert_copy_t * copy = (mpeg2convert_copy_t *) 0;
+    unsigned int id_size = sizeof (convert_rgb_t);
+    int chroma420 = (seq->chroma_height < seq->height); /* chroma has fewer rows */
+    int convert420 = 0;
+    int rgb_stride_min = ((bpp + 7) >> 3) * seq->width; /* bytes per packed row */
+    /* Prefer an accelerated copy routine; the C table below is the fallback. */
+#ifdef ARCH_X86
+    if (!copy && (accel & MPEG2_ACCEL_X86_MMXEXT)) {
+	convert420 = 0;
+	copy = mpeg2convert_rgb_mmxext (order, bpp, seq);
+    }
+    if (!copy && (accel & MPEG2_ACCEL_X86_MMX)) {
+	convert420 = 0;
+	copy = mpeg2convert_rgb_mmx (order, bpp, seq);
+    }
+#endif
+#ifdef ARCH_SPARC
+    if (!copy && (accel & MPEG2_ACCEL_SPARC_VIS)) {
+	convert420 = chroma420;
+	copy = mpeg2convert_rgb_vis (order, bpp, seq);
+    }
+#endif
+    if (!copy) {
+	int src, dest;
+	static void (* rgb_c[3][5]) (void *, uint8_t * const *,
+				     unsigned int) =
+	    {{rgb_c_24_bgr_420, rgb_c_8_420, rgb_c_16_420,
+	      rgb_c_24_rgb_420, rgb_c_32_420},
+	     {rgb_c_24_bgr_422, rgb_c_8_422, rgb_c_16_422,
+	      rgb_c_24_rgb_422, rgb_c_32_422},
+	     {rgb_c_24_bgr_444, rgb_c_8_444, rgb_c_16_444,
+	      rgb_c_24_rgb_444, rgb_c_32_444}};
+	/* src row: 0/1/2 = 4:2:0/4:2:2/4:4:4; dest: 0 = 24-bit BGR, else bytes/pixel */
+	convert420 = chroma420;
+	id_size = (sizeof (convert_rgb_c_t) +
+		   rgb_c_init ((convert_rgb_c_t *) id, order, bpp));
+	src = ((seq->chroma_width == seq->width) +
+	       (seq->chroma_height == seq->height));
+	dest = ((bpp == 24 && order == MPEG2CONVERT_BGR) ? 0 : (bpp + 7) >> 3);
+	copy = rgb_c[src][dest];
+    }
+
+    result->id_size = id_size;
+    /* a caller-supplied stride is only ever widened, never narrowed */
+    if (stride < rgb_stride_min)
+	stride = rgb_stride_min;
+
+    if (stage == MPEG2_CONVERT_STRIDE)
+	return stride;
+    else if (stage == MPEG2_CONVERT_START) {
+	id->width = seq->width >> 3;	/* counted in groups of 8 pixels */
+	id->y_stride_frame = seq->width;
+	id->uv_stride_frame = seq->chroma_width;
+	id->rgb_stride_frame = stride;
+	id->rgb_stride_min = rgb_stride_min;
+	id->chroma420 = chroma420;
+	id->convert420 = convert420;
+	result->buf_size[0] = stride * seq->height; /* one packed output plane */
+	result->buf_size[1] = result->buf_size[2] = 0;
+	result->start = rgb_start;
+	result->copy = copy;
+    }
+    return 0;
+}
+
+#define DECLARE(func,order,bpp)						\
+int func (int stage, void * id,						\
+	  const mpeg2_sequence_t * sequence, int stride,		\
+	  uint32_t accel, void * arg, mpeg2_convert_init_t * result)	\
+{									\
+    return rgb_internal (order, bpp, stage, id, sequence, stride,	\
+			 accel, arg, result);				\
+}
+/* DECLARE binds order/bpp into a call to rgb_internal; one entry point each: */
+DECLARE (mpeg2convert_rgb32, MPEG2CONVERT_RGB, 32)
+DECLARE (mpeg2convert_rgb24, MPEG2CONVERT_RGB, 24)
+DECLARE (mpeg2convert_rgb16, MPEG2CONVERT_RGB, 16)
+DECLARE (mpeg2convert_rgb15, MPEG2CONVERT_RGB, 15)
+DECLARE (mpeg2convert_rgb8, MPEG2CONVERT_RGB, 8)
+DECLARE (mpeg2convert_bgr32, MPEG2CONVERT_BGR, 32)
+DECLARE (mpeg2convert_bgr24, MPEG2CONVERT_BGR, 24)
+DECLARE (mpeg2convert_bgr16, MPEG2CONVERT_BGR, 16)
+DECLARE (mpeg2convert_bgr15, MPEG2CONVERT_BGR, 15)
+DECLARE (mpeg2convert_bgr8, MPEG2CONVERT_BGR, 8)
+
+mpeg2_convert_t * mpeg2convert_rgb (mpeg2convert_rgb_order_t order,
+				    unsigned int bpp)
+{   /* Returns the converter for (order, bpp), or a null pointer if unsupported. */
+    static mpeg2_convert_t * table[5][2] =
+	{{mpeg2convert_rgb15, mpeg2convert_bgr15},
+	 {mpeg2convert_rgb8, mpeg2convert_bgr8},
+	 {mpeg2convert_rgb16, mpeg2convert_bgr16},
+	 {mpeg2convert_rgb24, mpeg2convert_bgr24},
+	 {mpeg2convert_rgb32, mpeg2convert_bgr32}};
+    /* row 0 is 15 bpp; rows 1-4 are bpp >> 3 for 8/16/24/32; column 1 is BGR */
+    if (order == MPEG2CONVERT_RGB || order == MPEG2CONVERT_BGR) {
+	if (bpp == 15)
+	    return table[0][order == MPEG2CONVERT_BGR];
+	else if (bpp >= 8 && bpp <= 32 && (bpp & 7) == 0)
+	    return table[bpp >> 3][order == MPEG2CONVERT_BGR];
+    }
+    return (mpeg2_convert_t *) 0;
+}
diff --git a/src/libmpeg2new/libmpeg2/rgb_mmx.c b/src/libmpeg2new/libmpeg2/rgb_mmx.c
new file mode 100644
index 000000000..912291c6a
--- /dev/null
+++ b/src/libmpeg2new/libmpeg2/rgb_mmx.c
@@ -0,0 +1,321 @@
+/*
+ * rgb_mmx.c
+ * Copyright (C) 2000-2003 Silicon Integrated System Corp.
+ * All Rights Reserved.
+ *
+ * Author: Olie Lho <ollie@sis.com.tw>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_X86
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "mpeg2convert.h"
+#include "convert_internal.h"
+#include "attributes.h"
+#include "mmx.h"
+
+#define CPU_MMXEXT 0
+#define CPU_MMX 1
+
+/* CPU_MMXEXT/CPU_MMX adaptation layer */
+/* movntq() reads a variable named `cpu` from the scope where it is expanded. */
+#define movntq(src,dest)	\
+do {				\
+    if (cpu == CPU_MMXEXT)	\
+	movntq_r2m (src, dest);	\
+    else			\
+	movq_r2m (src, dest);	\
+} while (0)
+
+static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* 8 Y + 4 U + 4 V samples in; leaves B in mm0, R in mm1, G in mm2. */
+    static mmx_t mmx_80w = {0x0080008000800080LL};
+    static mmx_t mmx_U_green = {0xf37df37df37df37dLL};
+    static mmx_t mmx_U_blue = {0x4093409340934093LL};
+    static mmx_t mmx_V_red = {0x3312331233123312LL};
+    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
+    static mmx_t mmx_10w = {0x1010101010101010LL}; /* 16 in every byte (psubusb) */
+    static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};
+    static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};
+
+    movd_m2r (*pu, mm0);		/* mm0 = 00 00 00 00 u3 u2 u1 u0 */
+    movd_m2r (*pv, mm1);		/* mm1 = 00 00 00 00 v3 v2 v1 v0 */
+    movq_m2r (*py, mm6);		/* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+    pxor_r2r (mm4, mm4);		/* mm4 = 0 */
+    /* XXX might do cache preload for image here */
+
+    /*
+     * Do the multiply part of the conversion; chroma is shared by the
+     * even and odd pixels, so it is computed only once here:
+     * mm0 -> chroma_b, mm1 -> chroma_r, mm2 -> chroma_g
+     * mm3 -> scratch (v * v_green), mm4 -> 0
+     * mm6 -> Y even, mm7 -> Y odd
+     */
+
+    punpcklbw_r2r (mm4, mm0);		/* mm0 = u3 u2 u1 u0 */
+    punpcklbw_r2r (mm4, mm1);		/* mm1 = v3 v2 v1 v0 */
+    psubsw_m2r (mmx_80w, mm0);		/* u -= 128 */
+    psubsw_m2r (mmx_80w, mm1);		/* v -= 128 */
+    psllw_i2r (3, mm0);			/* promote precision */
+    psllw_i2r (3, mm1);			/* promote precision */
+    movq_r2r (mm0, mm2);		/* mm2 = u3 u2 u1 u0 */
+    movq_r2r (mm1, mm3);		/* mm3 = v3 v2 v1 v0 */
+    pmulhw_m2r (mmx_U_green, mm2);	/* mm2 = u * u_green */
+    pmulhw_m2r (mmx_V_green, mm3);	/* mm3 = v * v_green */
+    pmulhw_m2r (mmx_U_blue, mm0);	/* mm0 = chroma_b */
+    pmulhw_m2r (mmx_V_red, mm1);	/* mm1 = chroma_r */
+    paddsw_r2r (mm3, mm2);		/* mm2 = chroma_g */
+
+    psubusb_m2r (mmx_10w, mm6);		/* Y -= 16 */
+    movq_r2r (mm6, mm7);		/* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+    pand_m2r (mmx_00ffw, mm6);		/* mm6 =    Y6    Y4    Y2    Y0 */
+    psrlw_i2r (8, mm7);			/* mm7 =    Y7    Y5    Y3    Y1 */
+    psllw_i2r (3, mm6);			/* promote precision */
+    psllw_i2r (3, mm7);			/* promote precision */
+    pmulhw_m2r (mmx_Y_coeff, mm6);	/* mm6 = luma_rgb even */
+    pmulhw_m2r (mmx_Y_coeff, mm7);	/* mm7 = luma_rgb odd */
+
+    /*
+     * Do the addition part of the conversion for even and odd pixels
+     * register usage:
+     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
+     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
+     * mm6 -> Y even, mm7 -> Y odd
+     */
+
+    movq_r2r (mm0, mm3);		/* mm3 = chroma_b */
+    movq_r2r (mm1, mm4);		/* mm4 = chroma_r */
+    movq_r2r (mm2, mm5);		/* mm5 = chroma_g */
+    paddsw_r2r (mm6, mm0);		/* mm0 = B6 B4 B2 B0 */
+    paddsw_r2r (mm7, mm3);		/* mm3 = B7 B5 B3 B1 */
+    paddsw_r2r (mm6, mm1);		/* mm1 = R6 R4 R2 R0 */
+    paddsw_r2r (mm7, mm4);		/* mm4 = R7 R5 R3 R1 */
+    paddsw_r2r (mm6, mm2);		/* mm2 = G6 G4 G2 G0 */
+    paddsw_r2r (mm7, mm5);		/* mm5 = G7 G5 G3 G1 */
+    packuswb_r2r (mm0, mm0);		/* saturate to 0-255 */
+    packuswb_r2r (mm1, mm1);		/* saturate to 0-255 */
+    packuswb_r2r (mm2, mm2);		/* saturate to 0-255 */
+    packuswb_r2r (mm3, mm3);		/* saturate to 0-255 */
+    packuswb_r2r (mm4, mm4);		/* saturate to 0-255 */
+    packuswb_r2r (mm5, mm5);		/* saturate to 0-255 */
+    punpcklbw_r2r (mm3, mm0);		/* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
+    punpcklbw_r2r (mm4, mm1);		/* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
+    punpcklbw_r2r (mm5, mm2);		/* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
+}
+
+static inline void mmx_unpack_16rgb (uint8_t * image, const int cpu)
+{   /* Packs the 8 pixels left in mm0/mm1/mm2 into 16 bytes at image. */
+    static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};
+    static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
+    static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};
+
+    /*
+     * convert RGB plane to RGB 16 bits (5 red, 6 green, 5 blue bits)
+     * in:  mm0 -> B, mm1 -> R, mm2 -> G
+     * out: mm0 -> pixels 0-3 (stored at image)
+     *      mm5 -> pixels 4-7 (stored at image+8)
+     */
+
+    pand_m2r (mmx_bluemask, mm0);	/* mm0 = b7b6b5b4b3______ */
+    pand_m2r (mmx_greenmask, mm2);	/* mm2 = g7g6g5g4g3g2____ */
+    pand_m2r (mmx_redmask, mm1);	/* mm1 = r7r6r5r4r3______ */
+    psrlq_i2r (3, mm0);			/* mm0 = ______b7b6b5b4b3 */
+    pxor_r2r (mm4, mm4);		/* mm4 = 0 */
+    movq_r2r (mm0, mm5);		/* mm5 = ______b7b6b5b4b3 */
+    movq_r2r (mm2, mm7);		/* mm7 = g7g6g5g4g3g2____ */
+
+    punpcklbw_r2r (mm4, mm2);		/* low 4 G bytes widened to words */
+    punpcklbw_r2r (mm1, mm0);		/* low 4 pixels: R in high byte, B in low */
+    psllq_i2r (3, mm2);			/* move green up to sit above blue */
+    por_r2r (mm2, mm0);
+    movntq (mm0, *image);
+
+    punpckhbw_r2r (mm4, mm7);		/* same again for the high 4 pixels */
+    punpckhbw_r2r (mm1, mm5);
+    psllq_i2r (3, mm7);
+    por_r2r (mm7, mm5);
+    movntq (mm5, *(image+8));
+}
+
+static inline void mmx_unpack_32rgb (uint8_t * image, const int cpu)
+{   /* Writes the 8 pixels left in mm0/mm1/mm2 as 32 bytes at image. */
+    /*
+     * convert RGB plane to RGB packed format,
+     * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
+     * mm4 -> GB, mm5 -> AR pixel 4-7,
+     * mm6 -> GB, mm7 -> AR pixel 0-3
+     */
+
+    pxor_r2r (mm3, mm3);
+    movq_r2r (mm0, mm6);
+    movq_r2r (mm1, mm7);
+    movq_r2r (mm0, mm4);
+    movq_r2r (mm1, mm5);
+    punpcklbw_r2r (mm2, mm6);
+    punpcklbw_r2r (mm3, mm7);
+    punpcklwd_r2r (mm7, mm6);
+    movntq (mm6, *image);		/* pixels 0-1 */
+    movq_r2r (mm0, mm6);
+    punpcklbw_r2r (mm2, mm6);
+    punpckhwd_r2r (mm7, mm6);
+    movntq (mm6, *(image+8));		/* pixels 2-3 */
+    punpckhbw_r2r (mm2, mm4);
+    punpckhbw_r2r (mm3, mm5);
+    punpcklwd_r2r (mm5, mm4);
+    movntq (mm4, *(image+16));		/* pixels 4-5 */
+    movq_r2r (mm0, mm4);
+    punpckhbw_r2r (mm2, mm4);
+    punpckhwd_r2r (mm5, mm4);
+    movntq (mm4, *(image+24));		/* pixels 6-7 */
+}
+
+static inline void rgb16 (void * const _id, uint8_t * const * src,
+			  const unsigned int v_offset, const int cpu)
+{   /* Converts 16 luma rows to 16-bit pixels; id->width counts 8-pixel groups. */
+    convert_rgb_t * const id = (convert_rgb_t *) _id;
+    uint8_t * dst;
+    uint8_t * py, * pu, * pv;
+    int i, j;
+
+    dst = id->rgb_ptr + id->rgb_slice * v_offset;
+    py = src[0];	pu = src[1];	pv = src[2];
+
+    i = 16;
+    do {
+	j = id->width;
+	do {
+	    mmx_yuv2rgb (py, pu, pv);
+	    mmx_unpack_16rgb (dst, cpu);
+	    py += 8;
+	    pu += 4;
+	    pv += 4;
+	    dst += 16;
+	} while (--j);
+	/* end of row: step output and luma, then pick the chroma row to use */
+	dst += id->rgb_increm;
+	py += id->y_increm;
+	if (--i == id->field) {	/* restart from the second source row */
+	    dst = id->rgb_ptr + id->rgb_slice * (v_offset + 1);
+	    py = src[0] + id->y_stride_frame;
+	    pu = src[1] + id->uv_stride_frame;
+	    pv = src[2] + id->uv_stride_frame;
+	} else if (! (i & id->chroma420)) {	/* move on to the next chroma row */
+	    pu += id->uv_increm;
+	    pv += id->uv_increm;
+	} else {	/* step chroma back one row width so the row is reused */
+	    pu -= id->uv_stride_frame;
+	    pv -= id->uv_stride_frame;
+	}
+    } while (i);
+}
+
+static inline void argb32 (void * const _id, uint8_t * const * src,
+			   const unsigned int v_offset, const int cpu)
+{   /* Converts 16 luma rows to 32-bit pixels; id->width counts 8-pixel groups. */
+    convert_rgb_t * const id = (convert_rgb_t *) _id;
+    uint8_t * dst;
+    uint8_t * py, * pu, * pv;
+    int i, j;
+
+    dst = id->rgb_ptr + id->rgb_slice * v_offset;
+    py = src[0];	pu = src[1];	pv = src[2];
+
+    i = 16;
+    do {
+	j = id->width;
+	do {
+	    mmx_yuv2rgb (py, pu, pv);
+	    mmx_unpack_32rgb (dst, cpu);
+	    py += 8;
+	    pu += 4;
+	    pv += 4;
+	    dst += 32;
+	} while (--j);
+	/* end of row: step output and luma, then pick the chroma row to use */
+	dst += id->rgb_increm;
+	py += id->y_increm;
+	if (--i == id->field) {	/* restart from the second source row */
+	    dst = id->rgb_ptr + id->rgb_slice * (v_offset + 1);
+	    py = src[0] + id->y_stride_frame;
+	    pu = src[1] + id->uv_stride_frame;
+	    pv = src[2] + id->uv_stride_frame;
+	} else if (! (i & id->chroma420)) {	/* move on to the next chroma row */
+	    pu += id->uv_increm;
+	    pv += id->uv_increm;
+	} else {	/* step chroma back one row width so the row is reused */
+	    pu -= id->uv_stride_frame;
+	    pv -= id->uv_stride_frame;
+	}
+    } while (i);
+}
+
+static void mmxext_rgb16 (void * id, uint8_t * const * src,
+			  unsigned int v_offset)
+{   /* rgb16 with cpu fixed to CPU_MMXEXT (movntq stores) */
+    rgb16 (id, src, v_offset, CPU_MMXEXT);
+}
+
+static void mmxext_argb32 (void * id, uint8_t * const * src,
+			   unsigned int v_offset)
+{   /* argb32 with cpu fixed to CPU_MMXEXT (movntq stores) */
+    argb32 (id, src, v_offset, CPU_MMXEXT);
+}
+
+static void mmx_rgb16 (void * id, uint8_t * const * src, unsigned int v_offset)
+{   /* rgb16 with cpu fixed to CPU_MMX (plain movq stores) */
+    rgb16 (id, src, v_offset, CPU_MMX);
+}
+
+static void mmx_argb32 (void * id, uint8_t * const * src,
+			unsigned int v_offset)
+{   /* argb32 with cpu fixed to CPU_MMX (plain movq stores) */
+    argb32 (id, src, v_offset, CPU_MMX);
+}
+
+mpeg2convert_copy_t * mpeg2convert_rgb_mmxext (int order, int bpp,
+					       const mpeg2_sequence_t * seq)
+{   /* MMXEXT routine for RGB order, 16/32 bpp, chroma narrower than luma. */
+    if (order == MPEG2CONVERT_RGB && seq->chroma_width < seq->width) {
+	if (bpp == 16)
+	    return mmxext_rgb16;
+	else if (bpp == 32)
+	    return mmxext_argb32;
+    }
+    return NULL;	/* Fallback to C */
+}
+
+mpeg2convert_copy_t * mpeg2convert_rgb_mmx (int order, int bpp,
+					    const mpeg2_sequence_t * seq)
+{   /* MMX routine for RGB order, 16/32 bpp, chroma narrower than luma. */
+    if (order == MPEG2CONVERT_RGB && seq->chroma_width < seq->width) {
+	if (bpp == 16)
+	    return mmx_rgb16;
+	else if (bpp == 32)
+	    return mmx_argb32;
+    }
+    return NULL;	/* Fallback to C */
+}
+#endif
diff --git a/src/libmpeg2new/libmpeg2/rgb_vis.c b/src/libmpeg2new/libmpeg2/rgb_vis.c
new file mode 100644
index 000000000..49d8d1d7c
--- /dev/null
+++ b/src/libmpeg2new/libmpeg2/rgb_vis.c
@@ -0,0 +1,384 @@
+/*
+ * rgb_vis.c
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_SPARC
+
+#include <stddef.h>
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "mpeg2convert.h"
+#include "convert_internal.h"
+#include "attributes.h"
+#include "vis.h"
+
+/* Based partially upon the MMX yuv2rgb code, see there for credits.
+ *
+ * The difference here is that since we have enough registers we
+ * process both even and odd scanlines in one pass.
+ */
+
+static const uint16_t const_2048[] ATTR_ALIGN(8) = {2048, 2048, 2048, 2048};
+static const uint16_t const_1024[] ATTR_ALIGN(8) = {1024, 1024, 1024, 1024};
+static const uint16_t const_128[] ATTR_ALIGN(8) = {128, 128, 128, 128};
+static const uint8_t const_Ugreen[] ATTR_ALIGN(8) =	/* cf. MMX 0xf37d */
+	{0xf3, 0x00, 0xf3, 0x00, 0xf3, 0x00, 0xf3, 0x00};
+static const uint8_t const_Vgreen[] ATTR_ALIGN(8) =	/* cf. MMX 0xe5fc */
+	{0xe6, 0x00, 0xe6, 0x00, 0xe6, 0x00, 0xe6, 0x00};
+static const uint8_t const_Ublue_Vred[] ATTR_ALIGN(8) =	/* cf. MMX 0x4093, 0x3312 */
+	{0x41, 0x41, 0x41, 0x41, 0x33, 0x33, 0x33, 0x33};
+static const uint8_t const_Ycoeff[] ATTR_ALIGN(4) = {0x25, 0x25, 0x25, 0x25};
+
+#define TMP0		0	/* TMP0-TMP13: scratch operands */
+#define TMP1		1
+#define TMP2		2
+#define TMP3		3
+#define TMP4		4
+#define TMP5		5
+#define TMP6		6
+#define TMP7		7
+#define TMP8		8
+#define TMP9		9
+#define TMP10		10
+#define TMP11		11
+#define TMP12		12
+#define TMP13		13
+/* constants preloaded by vis_init_consts() */
+#define CONST_UBLUE	14
+#define CONST_VRED	15
+#define CONST_2048	16
+/* packed 8-bit channels for the first scanline, written by vis_yuv2rgb() */
+#define BLUE8_EVEN	18
+#define BLUE8_ODD	19
+#define RED8_EVEN	20
+#define RED8_ODD	21
+#define GREEN8_EVEN	22
+#define GREEN8_ODD	23
+/* packed 8-bit channels for the second scanline (py + y_stride) */
+#define BLUE8_2_EVEN	24
+#define BLUE8_2_ODD	25
+#define RED8_2_EVEN	26
+#define RED8_2_ODD	27
+#define GREEN8_2_EVEN	28
+#define GREEN8_2_ODD	29
+
+#define CONST_YCOEFF	30
+#define ZEROS		31
+/* 16-bit intermediates; numbered in steps of 2 (presumably 64-bit registers -- confirm in vis.h) */
+#define PU_0		32
+#define PU_2		34
+#define PV_0		36
+#define PV_2		38
+#define PY_0		40
+#define PY_2		42
+#define PY_4		44
+#define PY_6		46
+
+#define CONST_128	56
+#define CONST_1024	58
+#define CONST_VGREEN	60
+#define CONST_UGREEN	62
+
+static inline void vis_init_consts(void)
+{	/* sets the GSR scale factor to 7 and preloads the constant operands */
+	vis_set_gsr(7 << VIS_GSR_SCALEFACT_SHIFT);
+	/* NOTE(review): the 64-bit load into CONST_UBLUE presumably also fills CONST_VRED -- confirm */
+	vis_ld64(const_2048[0], CONST_2048);
+	vis_ld64(const_1024[0], CONST_1024);
+	vis_ld64(const_Ugreen[0], CONST_UGREEN);
+	vis_ld64(const_Vgreen[0], CONST_VGREEN);
+	vis_fzeros(ZEROS);
+	vis_ld64(const_Ublue_Vred[0], CONST_UBLUE);
+	vis_ld32(const_Ycoeff[0], CONST_YCOEFF);
+	vis_ld64(const_128[0],  CONST_128);
+}
+
+static inline void vis_yuv2rgb(uint8_t *py, uint8_t *pu, uint8_t *pv,
+			       int y_stride)
+{	/* 8 luma samples on each of two rows (py, py + y_stride) sharing 4 U/V samples */
+	vis_ld32(pu[0], TMP0);
+
+	vis_ld32(pv[0], TMP2);
+
+	vis_ld64(py[0], TMP4);
+	vis_mul8x16au(TMP0, CONST_2048, PU_0);
+
+	vis_ld64_2(py, y_stride, TMP8);
+	vis_mul8x16au(TMP2, CONST_2048, PV_0);
+	/* two merges per luma row: presumably separates even and odd pixels -- confirm */
+	vis_pmerge(TMP4, TMP5, TMP6);
+
+	vis_pmerge(TMP6, TMP7, TMP4);
+
+	vis_pmerge(TMP8, TMP9, TMP10);
+
+	vis_pmerge(TMP10, TMP11, TMP8);
+	vis_mul8x16au(TMP4, CONST_2048, PY_0);
+	/* remove offsets: 1024 from U/V, 128 from Y (cf. u-128 and Y-16 in the MMX code) */
+	vis_psub16(PU_0, CONST_1024, PU_0);
+	vis_mul8x16au(TMP5, CONST_2048, PY_2);
+
+	vis_psub16(PV_0, CONST_1024, PV_0);
+	vis_mul8x16au(TMP8, CONST_2048, PY_4);
+
+	vis_psub16(PY_0, CONST_128, PY_0);
+	vis_mul8x16au(TMP9, CONST_2048, PY_6);
+
+	vis_psub16(PY_2, CONST_128, PY_2);
+	vis_mul8x16(CONST_YCOEFF, PY_0, PY_0);
+
+	vis_psub16(PY_4, CONST_128, PY_4);
+	vis_mul8x16(CONST_YCOEFF, PY_2, PY_2);
+
+	vis_psub16(PY_6, CONST_128, PY_6);
+	vis_mul8x16(CONST_YCOEFF, PY_4, PY_4);
+
+	vis_mul8x16(CONST_YCOEFF, PY_6, PY_6);
+	/* chroma terms: green summed into TMP10, blue in TMP4, red in TMP6 */
+	vis_mul8sux16(CONST_UGREEN, PU_0, TMP0);
+
+	vis_mul8sux16(CONST_VGREEN, PV_0, TMP2);
+
+	vis_mul8x16(CONST_UBLUE, PU_0, TMP4);
+
+	vis_mul8x16(CONST_VRED, PV_0, TMP6);
+	vis_padd16(TMP0, TMP2, TMP10);
+	/* add each luma group to each chroma term, then pack to the 8-bit registers */
+	vis_padd16(PY_0, TMP4, TMP0);
+
+	vis_padd16(PY_2, TMP4, TMP2);
+	vis_pack16(TMP0, BLUE8_EVEN);
+
+	vis_padd16(PY_4, TMP4, TMP0);
+	vis_pack16(TMP2, BLUE8_ODD);
+
+	vis_padd16(PY_6, TMP4, TMP2);
+	vis_pack16(TMP0, BLUE8_2_EVEN);
+
+	vis_padd16(PY_0, TMP6, TMP0);
+	vis_pack16(TMP2, BLUE8_2_ODD);
+
+	vis_padd16(PY_2, TMP6, TMP2);
+	vis_pack16(TMP0, RED8_EVEN);
+
+	vis_padd16(PY_4, TMP6, TMP0);
+	vis_pack16(TMP2, RED8_ODD);
+
+	vis_padd16(PY_6, TMP6, TMP2);
+	vis_pack16(TMP0, RED8_2_EVEN);
+
+	vis_padd16(PY_0, TMP10, TMP0);
+	vis_pack16(TMP2, RED8_2_ODD);
+
+	vis_padd16(PY_2, TMP10, TMP2);
+	vis_pack16(TMP0, GREEN8_EVEN);
+
+	vis_padd16(PY_4, TMP10, TMP0);
+	vis_pack16(TMP2, GREEN8_ODD);
+
+	vis_padd16(PY_6, TMP10, TMP2);
+	vis_pack16(TMP0, GREEN8_2_EVEN);
+
+	vis_pack16(TMP2, GREEN8_2_ODD);
+	vis_pmerge(BLUE8_EVEN, BLUE8_ODD, BLUE8_EVEN);
+	/* NOTE(review): each merge writes back to *_EVEN, presumably spanning *_ODD too -- confirm */
+	vis_pmerge(BLUE8_2_EVEN, BLUE8_2_ODD, BLUE8_2_EVEN);
+
+	vis_pmerge(RED8_EVEN, RED8_ODD, RED8_EVEN);
+
+	vis_pmerge(RED8_2_EVEN, RED8_2_ODD, RED8_2_EVEN);
+
+	vis_pmerge(GREEN8_EVEN, GREEN8_ODD, GREEN8_EVEN);
+
+	vis_pmerge(GREEN8_2_EVEN, GREEN8_2_ODD, GREEN8_2_EVEN);
+}
+
+static inline void vis_unpack_32rgb(uint8_t *image, int stride)
+{	/* stores 32 bytes at image and 32 more at image + stride (zero/R/G/B merge order) */
+	vis_pmerge(ZEROS, GREEN8_EVEN, TMP0);
+	vis_pmerge(RED8_EVEN, BLUE8_EVEN, TMP2);
+
+	vis_pmerge(TMP0, TMP2, TMP4);
+	vis_st64(TMP4, image[0]);
+
+	vis_pmerge(TMP1, TMP3, TMP6);
+	vis_st64_2(TMP6, image, 8);
+
+	vis_pmerge(ZEROS, GREEN8_ODD, TMP8);
+	vis_pmerge(RED8_ODD, BLUE8_ODD, TMP10);
+
+	vis_pmerge(TMP8, TMP10, TMP0);
+	vis_st64_2(TMP0, image, 16);
+
+	vis_pmerge(TMP9, TMP11, TMP2);
+	vis_st64_2(TMP2, image, 24);
+	/* second row: the same sequence using the *_2_* registers */
+	image += stride;
+
+	vis_pmerge(ZEROS, GREEN8_2_EVEN, TMP0);
+	vis_pmerge(RED8_2_EVEN, BLUE8_2_EVEN, TMP2);
+
+	vis_pmerge(TMP0, TMP2, TMP4);
+	vis_st64(TMP4, image[0]);
+
+	vis_pmerge(TMP1, TMP3, TMP6);
+	vis_st64_2(TMP6, image, 8);
+
+	vis_pmerge(ZEROS, GREEN8_2_ODD, TMP8);
+	vis_pmerge(RED8_2_ODD, BLUE8_2_ODD, TMP10);
+
+	vis_pmerge(TMP8, TMP10, TMP0);
+	vis_st64_2(TMP0, image, 16);
+
+	vis_pmerge(TMP9, TMP11, TMP2);
+	vis_st64_2(TMP2, image, 24);
+}
+
+static inline void vis_unpack_32bgr(uint8_t *image, int stride)
+{	/* as vis_unpack_32rgb but with the red and blue operands swapped */
+	vis_pmerge(ZEROS, GREEN8_EVEN, TMP0);
+	vis_pmerge(BLUE8_EVEN, RED8_EVEN, TMP2);
+
+	vis_pmerge(TMP0, TMP2, TMP4);
+	vis_st64(TMP4, image[0]);
+
+	vis_pmerge(TMP1, TMP3, TMP6);
+	vis_st64_2(TMP6, image, 8);
+
+	vis_pmerge(ZEROS, GREEN8_ODD, TMP8);
+	vis_pmerge(BLUE8_ODD, RED8_ODD, TMP10);
+
+	vis_pmerge(TMP8, TMP10, TMP0);
+	vis_st64_2(TMP0, image, 16);
+
+	vis_pmerge(TMP9, TMP11, TMP2);
+	vis_st64_2(TMP2, image, 24);
+	/* second row: the same sequence using the *_2_* registers */
+	image += stride;
+
+	vis_pmerge(ZEROS, GREEN8_2_EVEN, TMP0);
+	vis_pmerge(BLUE8_2_EVEN, RED8_2_EVEN, TMP2);
+
+	vis_pmerge(TMP0, TMP2, TMP4);
+	vis_st64(TMP4, image[0]);
+
+	vis_pmerge(TMP1, TMP3, TMP6);
+	vis_st64_2(TMP6, image, 8);
+
+	vis_pmerge(ZEROS, GREEN8_2_ODD, TMP8);
+	vis_pmerge(BLUE8_2_ODD, RED8_2_ODD, TMP10);
+
+	vis_pmerge(TMP8, TMP10, TMP0);
+	vis_st64_2(TMP0, image, 16);
+
+	vis_pmerge(TMP9, TMP11, TMP2);
+	vis_st64_2(TMP2, image, 24);
+}
+
+static inline void vis_yuv420_argb32(uint8_t *image,
+				     uint8_t *py, uint8_t *pu, uint8_t *pv,
+				     int width, int height, int rgb_stride,
+				     int y_stride, int uv_stride)
+{	/* width is in pixels: width >> 3 steps per row, 4 * width output bytes */
+	height >>= 1;			/* two rows are produced per iteration */
+	uv_stride -= width >> 1;	/* now the skip from row end to next chroma row */
+	do {
+		int i = width >> 3;
+		do {
+			vis_yuv2rgb(py, pu, pv, y_stride);
+			vis_unpack_32rgb(image, rgb_stride);
+			py += 8;
+			pu += 4;
+			pv += 4;
+			image += 32;
+		} while (--i);
+		/* skip the row the inner loop already covered via y_stride/rgb_stride */
+		py    += (y_stride << 1) - width;
+		image += (rgb_stride << 1) - 4 * width;
+		pu    += uv_stride;
+		pv    += uv_stride;
+	} while (--height);
+}
+
+static inline void vis_yuv420_abgr32(uint8_t *image,
+				     uint8_t *py, uint8_t *pu, uint8_t *pv,
+				     int width, int height, int rgb_stride,
+				     int y_stride, int uv_stride)
+{	/* width is in pixels: width >> 3 steps per row, 4 * width output bytes */
+	height >>= 1;			/* two rows are produced per iteration */
+	uv_stride -= width >> 1;	/* now the skip from row end to next chroma row */
+	do {
+		int i = width >> 3;
+		do {
+			vis_yuv2rgb(py, pu, pv, y_stride);
+			vis_unpack_32bgr(image, rgb_stride);
+			py += 8;
+			pu += 4;
+			pv += 4;
+			image += 32;
+		} while (--i);
+		/* skip the row the inner loop already covered via y_stride/rgb_stride */
+		py    += (y_stride << 1) - width;
+		image += (rgb_stride << 1) - 4 * width;
+		pu    += uv_stride;
+		pv    += uv_stride;
+	} while (--height);
+}
+
+static void vis_argb32(void *_id, uint8_t * const *src,
+		       unsigned int v_offset)
+{	/* copy routine returned by mpeg2convert_rgb_vis for RGB order: 16 rows per call */
+	convert_rgb_t *id = (convert_rgb_t *) _id;
+	/* id->width is seq->width >> 3 (8-pixel groups); the helper expects pixels */
+	vis_init_consts();
+	vis_yuv420_argb32(id->rgb_ptr + id->rgb_stride * v_offset,
+			  src[0], src[1], src[2], id->width << 3, 16,
+			  id->rgb_stride, id->y_stride, id->y_stride >> 1);
+}
+
+static void vis_abgr32(void *_id, uint8_t * const *src,
+		       unsigned int v_offset)
+{	/* copy routine returned by mpeg2convert_rgb_vis for BGR order: 16 rows per call */
+	convert_rgb_t *id = (convert_rgb_t *) _id;
+	/* id->width is seq->width >> 3 (8-pixel groups); the helper expects pixels */
+	vis_init_consts();
+	vis_yuv420_abgr32(id->rgb_ptr + id->rgb_stride * v_offset,
+			  src[0], src[1], src[2], id->width << 3, 16,
+			  id->rgb_stride, id->y_stride, id->y_stride >> 1);
+}
+
+mpeg2convert_copy_t *mpeg2convert_rgb_vis(int order, int bpp,
+					  const mpeg2_sequence_t * seq)
+{	/* VIS routine for 32 bpp when chroma has fewer rows than luma, else NULL */
+	if (bpp == 32 && seq->chroma_height < seq->height) {
+		if (order == MPEG2CONVERT_RGB)
+			return vis_argb32;
+		if (order == MPEG2CONVERT_BGR)
+			return vis_abgr32;
+	}
+
+	return NULL;	/* Fallback to C */
+}
+
+#endif /* ARCH_SPARC */
diff --git a/src/libmpeg2new/libmpeg2/slice.c b/src/libmpeg2new/libmpeg2/slice.c
index 095fc4c82..ce4508639 100644
--- a/src/libmpeg2new/libmpeg2/slice.c
+++ b/src/libmpeg2new/libmpeg2/slice.c
@@ -1,6 +1,7 @@
 /*
  * slice.c
  * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2003      Peter Gubanov <peter@elecard.net.ru>
  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
@@ -26,8 +27,8 @@
 #include <inttypes.h>
 
 #include "../include/mpeg2.h"
-#include "mpeg2_internal.h"
 #include "../include/attributes.h"
+#include "mpeg2_internal.h"
 
 extern mpeg2_mc_t mpeg2_mc;
 extern void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
@@ -38,13 +39,6 @@ extern void (* mpeg2_cpu_state_restore) (cpu_state_t * state);
 
 #include "vlc.h"
 
-static int non_linear_quantizer_scale [] = {
-     0,  1,  2,  3,  4,  5,   6,   7,
-     8, 10, 12, 14, 16, 18,  20,  22,
-    24, 28, 32, 36, 40, 44,  48,  52,
-    56, 64, 72, 80, 88, 96, 104, 112
-};
-
 static inline int get_macroblock_modes (mpeg2_decoder_t * const decoder)
 {
 #define bit_buf (decoder->bitstream_buf)
@@ -76,24 +70,24 @@ static inline int get_macroblock_modes (mpeg2_decoder_t * const decoder)
 
 	if (decoder->picture_structure != FRAME_PICTURE) {
 	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
-		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
 		DUMPBITS (bit_buf, bits, 2);
 	    }
-	    return macroblock_modes;
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
 	} else if (decoder->frame_pred_frame_dct) {
 	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
-		macroblock_modes |= MC_FRAME;
-	    return macroblock_modes;
+		macroblock_modes |= MC_FRAME << MOTION_TYPE_SHIFT;
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
 	} else {
 	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
-		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
 		DUMPBITS (bit_buf, bits, 2);
 	    }
 	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
 		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
 		DUMPBITS (bit_buf, bits, 1);
 	    }
-	    return macroblock_modes;
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
 	}
 
     case B_TYPE:
@@ -104,18 +98,18 @@ static inline int get_macroblock_modes (mpeg2_decoder_t * const decoder)
 
 	if (decoder->picture_structure != FRAME_PICTURE) {
 	    if (! (macroblock_modes & MACROBLOCK_INTRA)) {
-		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
 		DUMPBITS (bit_buf, bits, 2);
 	    }
 	    return macroblock_modes;
 	} else if (decoder->frame_pred_frame_dct) {
 	    /* if (! (macroblock_modes & MACROBLOCK_INTRA)) */
-	    macroblock_modes |= MC_FRAME;
+	    macroblock_modes |= MC_FRAME << MOTION_TYPE_SHIFT;
 	    return macroblock_modes;
 	} else {
 	    if (macroblock_modes & MACROBLOCK_INTRA)
 		goto intra;
-	    macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+	    macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
 	    DUMPBITS (bit_buf, bits, 2);
 	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
 	    intra:
@@ -138,7 +132,7 @@ static inline int get_macroblock_modes (mpeg2_decoder_t * const decoder)
 #undef bit_ptr
 }
 
-static inline int get_quantizer_scale (mpeg2_decoder_t * const decoder)
+static inline void get_quantizer_scale (mpeg2_decoder_t * const decoder)
 {
 #define bit_buf (decoder->bitstream_buf)
 #define bits (decoder->bitstream_bits)
@@ -149,10 +143,14 @@ static inline int get_quantizer_scale (mpeg2_decoder_t * const decoder)
     quantizer_scale_code = UBITS (bit_buf, 5);
     DUMPBITS (bit_buf, bits, 5);
 
-    if (decoder->q_scale_type)
-	return non_linear_quantizer_scale [quantizer_scale_code];
-    else
-	return quantizer_scale_code << 1;
+    decoder->quantizer_matrix[0] =
+	decoder->quantizer_prescale[0][quantizer_scale_code];
+    decoder->quantizer_matrix[1] =
+	decoder->quantizer_prescale[1][quantizer_scale_code];
+    decoder->quantizer_matrix[2] =
+	decoder->chroma_quantizer[0][quantizer_scale_code];
+    decoder->quantizer_matrix[3] =
+	decoder->chroma_quantizer[1][quantizer_scale_code];
 #undef bit_buf
 #undef bits
 #undef bit_ptr
@@ -279,7 +277,7 @@ static inline int get_luma_dc_dct_diff (mpeg2_decoder_t * const decoder)
 	    dc_diff =
 		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
 	    bit_buf <<= size;
-	    return dc_diff;
+	    return dc_diff << decoder->intra_dc_precision;
 	} else {
 	    DUMPBITS (bit_buf, bits, 3);
 	    return 0;
@@ -291,7 +289,7 @@ static inline int get_luma_dc_dct_diff (mpeg2_decoder_t * const decoder)
 	NEEDBITS (bit_buf, bits, bit_ptr);
 	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
 	DUMPBITS (bit_buf, bits, size);
-	return dc_diff;
+	return dc_diff << decoder->intra_dc_precision;
     }
 #undef bit_buf
 #undef bits
@@ -316,7 +314,7 @@ static inline int get_chroma_dc_dct_diff (mpeg2_decoder_t * const decoder)
 	    dc_diff =
 		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
 	    bit_buf <<= size;
-	    return dc_diff;
+	    return dc_diff << decoder->intra_dc_precision;
 	} else {
 	    DUMPBITS (bit_buf, bits, 2);
 	    return 0;
@@ -328,35 +326,34 @@ static inline int get_chroma_dc_dct_diff (mpeg2_decoder_t * const decoder)
 	NEEDBITS (bit_buf, bits, bit_ptr);
 	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
 	DUMPBITS (bit_buf, bits, size);
-	return dc_diff;
+	return dc_diff << decoder->intra_dc_precision;
     }
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-#define SATURATE(val)					\
-do {							\
-    if (unlikely ((uint32_t)(val + 2048) > 4095))	\
-	val = SBITS (val, 1) ^ 2047;			\
+#define SATURATE(val)				\
+do {						\
+    val <<= 4;					\
+    if (unlikely (val != (int16_t) val))	\
+	val = (SBITS (val, 1) ^ 2047) << 4;	\
 } while (0)
 
-static void get_intra_block_B14 (mpeg2_decoder_t * const decoder)
+static void get_intra_block_B14 (mpeg2_decoder_t * const decoder,
+				 const uint16_t * const quant_matrix)
 {
     int i;
     int j;
     int val;
-    const uint8_t * scan = decoder->scan;
-    const uint8_t * quant_matrix = decoder->intra_quantizer_matrix;
-    int quantizer_scale = decoder->quantizer_scale;
+    const uint8_t * const scan = decoder->scan;
     int mismatch;
     const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
     const uint8_t * bit_ptr;
-    int16_t * dest;
+    int16_t * const dest = decoder->DCTblock;
 
-    dest = decoder->DCTblock;
     i = 0;
     mismatch = ~dest[0];
 
@@ -379,7 +376,7 @@ static void get_intra_block_B14 (mpeg2_decoder_t * const decoder)
 	    j = scan[i];
 	    bit_buf <<= tab->len;
 	    bits += tab->len + 1;
-	    val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+	    val = (tab->level * quant_matrix[j]) >> 4;
 
 	    /* if (bitstream_get (1)) val = -val; */
 	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
@@ -411,8 +408,7 @@ static void get_intra_block_B14 (mpeg2_decoder_t * const decoder)
 
 	    DUMPBITS (bit_buf, bits, 12);
 	    NEEDBITS (bit_buf, bits, bit_ptr);
-	    val = (SBITS (bit_buf, 12) *
-		   quantizer_scale * quant_matrix[j]) / 16;
+	    val = (SBITS (bit_buf, 12) * quant_matrix[j]) / 16;
 
 	    SATURATE (val);
 	    dest[j] = val;
@@ -448,29 +444,27 @@ static void get_intra_block_B14 (mpeg2_decoder_t * const decoder)
 	}
 	break;	/* illegal, check needed to avoid buffer overflow */
     }
-    dest[63] ^= mismatch & 1;
-    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    dest[63] ^= mismatch & 16;
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
     decoder->bitstream_buf = bit_buf;
     decoder->bitstream_bits = bits;
     decoder->bitstream_ptr = bit_ptr;
 }
 
-static void get_intra_block_B15 (mpeg2_decoder_t * const decoder)
+static void get_intra_block_B15 (mpeg2_decoder_t * const decoder,
+				 const uint16_t * const quant_matrix)
 {
     int i;
     int j;
     int val;
-    const uint8_t * scan = decoder->scan;
-    const uint8_t * quant_matrix = decoder->intra_quantizer_matrix;
-    int quantizer_scale = decoder->quantizer_scale;
+    const uint8_t * const scan = decoder->scan;
     int mismatch;
     const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
     const uint8_t * bit_ptr;
-    int16_t * dest;
+    int16_t * const dest = decoder->DCTblock;
 
-    dest = decoder->DCTblock;
     i = 0;
     mismatch = ~dest[0];
 
@@ -492,7 +486,7 @@ static void get_intra_block_B15 (mpeg2_decoder_t * const decoder)
 		j = scan[i];
 		bit_buf <<= tab->len;
 		bits += tab->len + 1;
-		val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+		val = (tab->level * quant_matrix[j]) >> 4;
 
 		/* if (bitstream_get (1)) val = -val; */
 		val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
@@ -523,8 +517,7 @@ static void get_intra_block_B15 (mpeg2_decoder_t * const decoder)
 
 		DUMPBITS (bit_buf, bits, 12);
 		NEEDBITS (bit_buf, bits, bit_ptr);
-		val = (SBITS (bit_buf, 12) *
-		       quantizer_scale * quant_matrix[j]) / 16;
+		val = (SBITS (bit_buf, 12) * quant_matrix[j]) / 16;
 
 		SATURATE (val);
 		dest[j] = val;
@@ -561,31 +554,29 @@ static void get_intra_block_B15 (mpeg2_decoder_t * const decoder)
 	}
 	break;	/* illegal, check needed to avoid buffer overflow */
     }
-    dest[63] ^= mismatch & 1;
-    DUMPBITS (bit_buf, bits, 4);	/* dump end of block code */
+    dest[63] ^= mismatch & 16;
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
     decoder->bitstream_buf = bit_buf;
     decoder->bitstream_bits = bits;
     decoder->bitstream_ptr = bit_ptr;
 }
 
-static int get_non_intra_block (mpeg2_decoder_t * const decoder)
+static int get_non_intra_block (mpeg2_decoder_t * const decoder,
+				const uint16_t * const quant_matrix)
 {
     int i;
     int j;
     int val;
-    const uint8_t * scan = decoder->scan;
-    const uint8_t * quant_matrix = decoder->non_intra_quantizer_matrix;
-    int quantizer_scale = decoder->quantizer_scale;
+    const uint8_t * const scan = decoder->scan;
     int mismatch;
     const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
     const uint8_t * bit_ptr;
-    int16_t * dest;
+    int16_t * const dest = decoder->DCTblock;
 
     i = -1;
-    mismatch = 1;
-    dest = decoder->DCTblock;
+    mismatch = -1;
 
     bit_buf = decoder->bitstream_buf;
     bits = decoder->bitstream_bits;
@@ -612,7 +603,7 @@ static int get_non_intra_block (mpeg2_decoder_t * const decoder)
 	    j = scan[i];
 	    bit_buf <<= tab->len;
 	    bits += tab->len + 1;
-	    val = ((2*tab->level+1) * quantizer_scale * quant_matrix[j]) >> 5;
+	    val = ((2 * tab->level + 1) * quant_matrix[j]) >> 5;
 
 	    /* if (bitstream_get (1)) val = -val; */
 	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
@@ -648,7 +639,7 @@ static int get_non_intra_block (mpeg2_decoder_t * const decoder)
 	    DUMPBITS (bit_buf, bits, 12);
 	    NEEDBITS (bit_buf, bits, bit_ptr);
 	    val = 2 * (SBITS (bit_buf, 12) + SBITS (bit_buf, 1)) + 1;
-	    val = (val * quantizer_scale * quant_matrix[j]) / 32;
+	    val = (val * quant_matrix[j]) / 32;
 
 	    SATURATE (val);
 	    dest[j] = val;
@@ -684,8 +675,8 @@ static int get_non_intra_block (mpeg2_decoder_t * const decoder)
 	}
 	break;	/* illegal, check needed to avoid buffer overflow */
     }
-    dest[63] ^= mismatch & 1;
-    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    dest[63] ^= mismatch & 16;
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
     decoder->bitstream_buf = bit_buf;
     decoder->bitstream_bits = bits;
     decoder->bitstream_ptr = bit_ptr;
@@ -697,17 +688,15 @@ static void get_mpeg1_intra_block (mpeg2_decoder_t * const decoder)
     int i;
     int j;
     int val;
-    const uint8_t * scan = decoder->scan;
-    const uint8_t * quant_matrix = decoder->intra_quantizer_matrix;
-    int quantizer_scale = decoder->quantizer_scale;
+    const uint8_t * const scan = decoder->scan;
+    const uint16_t * const quant_matrix = decoder->quantizer_matrix[0];
     const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
     const uint8_t * bit_ptr;
-    int16_t * dest;
+    int16_t * const dest = decoder->DCTblock;
 
     i = 0;
-    dest = decoder->DCTblock;
 
     bit_buf = decoder->bitstream_buf;
     bits = decoder->bitstream_bits;
@@ -728,7 +717,7 @@ static void get_mpeg1_intra_block (mpeg2_decoder_t * const decoder)
 	    j = scan[i];
 	    bit_buf <<= tab->len;
 	    bits += tab->len + 1;
-	    val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+	    val = (tab->level * quant_matrix[j]) >> 4;
 
 	    /* oddification */
 	    val = (val - 1) | 1;
@@ -767,7 +756,7 @@ static void get_mpeg1_intra_block (mpeg2_decoder_t * const decoder)
 		DUMPBITS (bit_buf, bits, 8);
 		val = UBITS (bit_buf, 8) + 2 * val;
 	    }
-	    val = (val * quantizer_scale * quant_matrix[j]) / 16;
+	    val = (val * quant_matrix[j]) / 16;
 
 	    /* oddification */
 	    val = (val + ~SBITS (val, 1)) | 1;
@@ -805,7 +794,7 @@ static void get_mpeg1_intra_block (mpeg2_decoder_t * const decoder)
 	}
 	break;	/* illegal, check needed to avoid buffer overflow */
     }
-    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
     decoder->bitstream_buf = bit_buf;
     decoder->bitstream_bits = bits;
     decoder->bitstream_ptr = bit_ptr;
@@ -816,17 +805,15 @@ static int get_mpeg1_non_intra_block (mpeg2_decoder_t * const decoder)
     int i;
     int j;
     int val;
-    const uint8_t * scan = decoder->scan;
-    const uint8_t * quant_matrix = decoder->non_intra_quantizer_matrix;
-    int quantizer_scale = decoder->quantizer_scale;
+    const uint8_t * const scan = decoder->scan;
+    const uint16_t * const quant_matrix = decoder->quantizer_matrix[1];
     const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
     const uint8_t * bit_ptr;
-    int16_t * dest;
+    int16_t * const dest = decoder->DCTblock;
 
     i = -1;
-    dest = decoder->DCTblock;
 
     bit_buf = decoder->bitstream_buf;
     bits = decoder->bitstream_bits;
@@ -853,7 +840,7 @@ static int get_mpeg1_non_intra_block (mpeg2_decoder_t * const decoder)
 	    j = scan[i];
 	    bit_buf <<= tab->len;
 	    bits += tab->len + 1;
-	    val = ((2*tab->level+1) * quantizer_scale * quant_matrix[j]) >> 5;
+	    val = ((2 * tab->level + 1) * quant_matrix[j]) >> 5;
 
 	    /* oddification */
 	    val = (val - 1) | 1;
@@ -896,7 +883,7 @@ static int get_mpeg1_non_intra_block (mpeg2_decoder_t * const decoder)
 		val = UBITS (bit_buf, 8) + 2 * val;
 	    }
 	    val = 2 * (val + SBITS (val, 1)) + 1;
-	    val = (val * quantizer_scale * quant_matrix[j]) / 32;
+	    val = (val * quant_matrix[j]) / 32;
 
 	    /* oddification */
 	    val = (val + ~SBITS (val, 1)) | 1;
@@ -934,7 +921,7 @@ static int get_mpeg1_non_intra_block (mpeg2_decoder_t * const decoder)
 	}
 	break;	/* illegal, check needed to avoid buffer overflow */
     }
-    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
     decoder->bitstream_buf = bit_buf;
     decoder->bitstream_bits = bits;
     decoder->bitstream_ptr = bit_ptr;
@@ -951,19 +938,19 @@ static inline void slice_intra_DCT (mpeg2_decoder_t * const decoder,
     NEEDBITS (bit_buf, bits, bit_ptr);
     /* Get the intra DC coefficient and inverse quantize it */
     if (cc == 0)
-	decoder->dc_dct_pred[0] += get_luma_dc_dct_diff (decoder);
+	decoder->DCTblock[0] =
+	    decoder->dc_dct_pred[0] += get_luma_dc_dct_diff (decoder);
     else
-	decoder->dc_dct_pred[cc] += get_chroma_dc_dct_diff (decoder);
-    decoder->DCTblock[0] =
-	decoder->dc_dct_pred[cc] << (3 - decoder->intra_dc_precision);
+	decoder->DCTblock[0] =
+	    decoder->dc_dct_pred[cc] += get_chroma_dc_dct_diff (decoder);
 
     if (decoder->mpeg1) {
 	if (decoder->coding_type != D_TYPE)
 	    get_mpeg1_intra_block (decoder);
     } else if (decoder->intra_vlc_format)
-	get_intra_block_B15 (decoder);
+	get_intra_block_B15 (decoder, decoder->quantizer_matrix[cc ? 2 : 0]);
     else
-	get_intra_block_B14 (decoder);
+	get_intra_block_B14 (decoder, decoder->quantizer_matrix[cc ? 2 : 0]);
     mpeg2_idct_copy (decoder->DCTblock, dest, stride);
 #undef bit_buf
 #undef bits
@@ -971,6 +958,7 @@ static inline void slice_intra_DCT (mpeg2_decoder_t * const decoder,
 }
 
 static inline void slice_non_intra_DCT (mpeg2_decoder_t * const decoder,
+					const int cc,
 					uint8_t * const dest, const int stride)
 {
     int last;
@@ -978,11 +966,12 @@ static inline void slice_non_intra_DCT (mpeg2_decoder_t * const decoder,
     if (decoder->mpeg1)
 	last = get_mpeg1_non_intra_block (decoder);
     else
-	last = get_non_intra_block (decoder);
+	last = get_non_intra_block (decoder,
+				    decoder->quantizer_matrix[cc ? 3 : 1]);
     mpeg2_idct_add (last, decoder->DCTblock, dest, stride);
 }
 
-#define MOTION(table,ref,motion_x,motion_y,size,y)			      \
+#define MOTION_420(table,ref,motion_x,motion_y,size,y)			      \
     pos_x = 2 * decoder->offset + motion_x;				      \
     pos_y = 2 * decoder->v_offset + motion_y + 2 * y;			      \
     if (unlikely (pos_x > decoder->limit_x)) {				      \
@@ -1009,7 +998,7 @@ static inline void slice_non_intra_DCT (mpeg2_decoder_t * const decoder,
 		      (decoder->offset >> 1), ref[2] + offset,		      \
 		      decoder->uv_stride, size/2)
 
-#define MOTION_FIELD(table,ref,motion_x,motion_y,dest_field,op,src_field)     \
+#define MOTION_FIELD_420(table,ref,motion_x,motion_y,dest_field,op,src_field) \
     pos_x = 2 * decoder->offset + motion_x;				      \
     pos_y = decoder->v_offset + motion_y;				      \
     if (unlikely (pos_x > decoder->limit_x)) {				      \
@@ -1038,13 +1027,237 @@ static inline void slice_non_intra_DCT (mpeg2_decoder_t * const decoder,
 		      (decoder->offset >> 1), ref[2] + offset,		      \
 		      2 * decoder->uv_stride, 4)
 
+#define MOTION_DMV_420(table,ref,motion_x,motion_y) /* 4:2:0 chroma */      \
+    pos_x = 2 * decoder->offset + motion_x;	/* half-sample units */	      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }	/* out-of-range x becomes 0 or limit_x; motion_x follows */	      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }	/* likewise for y against limit_y */				      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* bit 1: y, bit 0: x */      \
+    offset = (pos_x >> 1) + (pos_y & ~1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + decoder->offset, /* luma rows 0,2,.. */ \
+		    ref[0] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[0] + decoder->stride + decoder->offset,     \
+		    ref[0] + decoder->stride + offset,	/* rows 1,3,.. */     \
+		    2 * decoder->stride, 8);				      \
+    motion_x /= 2;	motion_y /= 2;	/* chroma vector is halved */	      \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    offset = (((decoder->offset + motion_x) >> 1) +			      \
+	      (((decoder->v_offset >> 1) + (motion_y & ~1)) *		      \
+	       decoder->uv_stride));					      \
+    table[4+xy_half] (decoder->dest[1] + (decoder->offset >> 1),	      \
+		      ref[1] + offset, 2 * decoder->uv_stride, 4);	      \
+    table[4+xy_half] (decoder->dest[1] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),	/* second row set */	      \
+		      ref[1] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 4);			      \
+    table[4+xy_half] (decoder->dest[2] + (decoder->offset >> 1),	      \
+		      ref[2] + offset, 2 * decoder->uv_stride, 4);	      \
+    table[4+xy_half] (decoder->dest[2] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),	/* second row set */	      \
+		      ref[2] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 4)
+
+#define MOTION_ZERO_420(table,ref)  /* no vector: same position in ref */    \
+    table[0] (decoder->dest[0] + decoder->offset,	/* luma, 16 rows */   \
+	      (ref[0] + decoder->offset +				      \
+	       decoder->v_offset * decoder->stride), decoder->stride, 16);    \
+    offset = ((decoder->offset >> 1) +	/* chroma: x and y both halved */     \
+	      (decoder->v_offset >> 1) * decoder->uv_stride);		      \
+    table[4] (decoder->dest[1] + (decoder->offset >> 1), /* 8 chroma rows */  \
+	      ref[1] + offset, decoder->uv_stride, 8);			      \
+    table[4] (decoder->dest[2] + (decoder->offset >> 1),		      \
+	      ref[2] + offset, decoder->uv_stride, 8)
+
+#define MOTION_422(table,ref,motion_x,motion_y,size,y)	/* 4:2:2 chroma */    \
+    pos_x = 2 * decoder->offset + motion_x;	/* half-sample units */	      \
+    pos_y = 2 * decoder->v_offset + motion_y + 2 * y;			      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }	/* out-of-range x becomes 0 or limit_x; motion_x follows */	      \
+    if (unlikely (pos_y > decoder->limit_y_ ## size)) {			      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y_ ## size;	      \
+	motion_y = pos_y - 2 * decoder->v_offset - 2 * y;		      \
+    }	/* "## size" pastes the field name, e.g. limit_y_16 */		      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* bit 1: y, bit 0: x */      \
+    offset = (pos_x >> 1) + (pos_y >> 1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + y * decoder->stride + decoder->offset, \
+		    ref[0] + offset, decoder->stride, size);		      \
+    /* +1 if motion_x is negative and odd, then halve for chroma;	      \
+       NOTE(review): presumably assumes uv_stride == stride / 2 */	      \
+    offset = (offset + (motion_x & (motion_x < 0))) >> 1; motion_x /= 2;     \
+    xy_half = ((pos_y & 1) << 1) | (motion_x & 1);			      \
+    table[4+xy_half] (decoder->dest[1] + y * decoder->uv_stride +	      \
+		      (decoder->offset >> 1), ref[1] + offset,		      \
+		      decoder->uv_stride, size);			      \
+    table[4+xy_half] (decoder->dest[2] + y * decoder->uv_stride +	      \
+		      (decoder->offset >> 1), ref[2] + offset,		      \
+		      decoder->uv_stride, size)
+
+#define MOTION_FIELD_422(table,ref,motion_x,motion_y,dest_field,op,src_field) \
+    pos_x = 2 * decoder->offset + motion_x;	/* half-sample units */	      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }	/* out-of-range x becomes 0 or limit_x; motion_x follows */	      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }	/* below, "op" is an operator tail such as "& ~1" or "| 1" */	      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* bit 1: y, bit 0: x */      \
+    offset = (pos_x >> 1) + ((pos_y op) + src_field) * decoder->stride;	      \
+    table[xy_half] (decoder->dest[0] + dest_field * decoder->stride +	      \
+		    decoder->offset, ref[0] + offset,			      \
+		    2 * decoder->stride, 8);	/* every other luma row */    \
+    offset = (offset + (motion_x & (motion_x < 0))) >> 1;		      \
+    motion_x /= 2;	/* chroma: x halved, y flag still from pos_y */	      \
+    xy_half = ((pos_y & 1) << 1) | (motion_x & 1);			      \
+    table[4+xy_half] (decoder->dest[1] + dest_field * decoder->uv_stride +    \
+		      (decoder->offset >> 1), ref[1] + offset,		      \
+		      2 * decoder->uv_stride, 8);			      \
+    table[4+xy_half] (decoder->dest[2] + dest_field * decoder->uv_stride +    \
+		      (decoder->offset >> 1), ref[2] + offset,		      \
+		      2 * decoder->uv_stride, 8)
+
+#define MOTION_DMV_422(table,ref,motion_x,motion_y) /* 4:2:2 chroma */      \
+    pos_x = 2 * decoder->offset + motion_x;	/* half-sample units */	      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }	/* out-of-range x becomes 0 or limit_x; motion_x follows */	      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }	/* likewise for y against limit_y */				      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* bit 1: y, bit 0: x */      \
+    offset = (pos_x >> 1) + (pos_y & ~1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + decoder->offset, /* luma rows 0,2,.. */ \
+		    ref[0] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[0] + decoder->stride + decoder->offset,     \
+		    ref[0] + decoder->stride + offset,	/* rows 1,3,.. */     \
+		    2 * decoder->stride, 8);				      \
+    offset = (offset + (motion_x & (motion_x < 0))) >> 1;		      \
+    motion_x /= 2;	/* chroma: x halved, y flag still from pos_y */	      \
+    xy_half = ((pos_y & 1) << 1) | (motion_x & 1);			      \
+    table[4+xy_half] (decoder->dest[1] + (decoder->offset >> 1),	      \
+		      ref[1] + offset, 2 * decoder->uv_stride, 8);	      \
+    table[4+xy_half] (decoder->dest[1] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),	/* second row set */	      \
+		      ref[1] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 8);			      \
+    table[4+xy_half] (decoder->dest[2] + (decoder->offset >> 1),	      \
+		      ref[2] + offset, 2 * decoder->uv_stride, 8);	      \
+    table[4+xy_half] (decoder->dest[2] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),	/* second row set */	      \
+		      ref[2] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 8)
+
+#define MOTION_ZERO_422(table,ref)  /* no vector: same position in ref */    \
+    offset = decoder->offset + decoder->v_offset * decoder->stride;	      \
+    table[0] (decoder->dest[0] + decoder->offset,	/* luma, 16 rows */   \
+	      ref[0] + offset, decoder->stride, 16);			      \
+    offset >>= 1; /* NOTE(review): presumably uv_stride == stride / 2 */      \
+    table[4] (decoder->dest[1] + (decoder->offset >> 1), /* 16 chroma rows */ \
+	      ref[1] + offset, decoder->uv_stride, 16);			      \
+    table[4] (decoder->dest[2] + (decoder->offset >> 1),		      \
+	      ref[2] + offset, decoder->uv_stride, 16)
+
+#define MOTION_444(table,ref,motion_x,motion_y,size,y)	/* 4:4:4 chroma */    \
+    pos_x = 2 * decoder->offset + motion_x;	/* half-sample units */	      \
+    pos_y = 2 * decoder->v_offset + motion_y + 2 * y;			      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }	/* out-of-range x becomes 0 or limit_x; motion_x follows */	      \
+    if (unlikely (pos_y > decoder->limit_y_ ## size)) {			      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y_ ## size;	      \
+	motion_y = pos_y - 2 * decoder->v_offset - 2 * y;		      \
+    }	/* "## size" pastes the field name, e.g. limit_y_16 */		      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* bit 1: y, bit 0: x */      \
+    offset = (pos_x >> 1) + (pos_y >> 1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + y * decoder->stride + decoder->offset, \
+		    ref[0] + offset, decoder->stride, size);		      \
+    table[xy_half] (decoder->dest[1] + y * decoder->stride + decoder->offset, \
+		    ref[1] + offset, decoder->stride, size); /* as luma */    \
+    table[xy_half] (decoder->dest[2] + y * decoder->stride + decoder->offset, \
+		    ref[2] + offset, decoder->stride, size)
+
+#define MOTION_FIELD_444(table,ref,motion_x,motion_y,dest_field,op,src_field) \
+    pos_x = 2 * decoder->offset + motion_x;	/* half-sample units */	      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }	/* out-of-range x becomes 0 or limit_x; motion_x follows */	      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }	/* below, "op" is an operator tail such as "& ~1" or "| 1" */	      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* bit 1: y, bit 0: x */      \
+    offset = (pos_x >> 1) + ((pos_y op) + src_field) * decoder->stride;	      \
+    table[xy_half] (decoder->dest[0] + dest_field * decoder->stride +	      \
+		    decoder->offset, ref[0] + offset,			      \
+		    2 * decoder->stride, 8);	/* every other luma row */    \
+    table[xy_half] (decoder->dest[1] + dest_field * decoder->stride +	      \
+		    decoder->offset, ref[1] + offset,			      \
+		    2 * decoder->stride, 8);	/* chroma: same as luma */    \
+    table[xy_half] (decoder->dest[2] + dest_field * decoder->stride +	      \
+		    decoder->offset, ref[2] + offset,			      \
+		    2 * decoder->stride, 8)
+
+#define MOTION_DMV_444(table,ref,motion_x,motion_y) /* 4:4:4 chroma */      \
+    pos_x = 2 * decoder->offset + motion_x;	/* half-sample units */	      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }	/* out-of-range x becomes 0 or limit_x; motion_x follows */	      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }	/* likewise for y against limit_y */				      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* bit 1: y, bit 0: x */      \
+    offset = (pos_x >> 1) + (pos_y & ~1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + decoder->offset, /* luma rows 0,2,.. */ \
+		    ref[0] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[0] + decoder->stride + decoder->offset,     \
+		    ref[0] + decoder->stride + offset,	/* rows 1,3,.. */     \
+		    2 * decoder->stride, 8);				      \
+    table[xy_half] (decoder->dest[1] + decoder->offset, /* chroma: as luma */ \
+		    ref[1] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[1] + decoder->stride + decoder->offset,     \
+		    ref[1] + decoder->stride + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    table[xy_half] (decoder->dest[2] + decoder->offset,			      \
+		    ref[2] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[2] + decoder->stride + decoder->offset,     \
+		    ref[2] + decoder->stride + offset,			      \
+		    2 * decoder->stride, 8)
+
+#define MOTION_ZERO_444(table,ref)  /* no vector: same position in ref */    \
+    offset = decoder->offset + decoder->v_offset * decoder->stride;	      \
+    table[0] (decoder->dest[0] + decoder->offset,	/* luma, 16 rows */   \
+	      ref[0] + offset, decoder->stride, 16);			      \
+    table[0] (decoder->dest[1] + decoder->offset, /* chroma addressed as */   \
+	      ref[1] + offset, decoder->stride, 16); /* luma, cf. MOTION_444 */ \
+    table[0] (decoder->dest[2] + decoder->offset, /* full offset, not >> 1 */ \
+	      ref[2] + offset, decoder->stride, 16)
+
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+
 static void motion_mp1 (mpeg2_decoder_t * const decoder,
 			motion_t * const motion,
 			mpeg2_mc_fct * const * const table)
 {
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
     int motion_x, motion_y;
     unsigned int pos_x, pos_y, xy_half, offset;
 
@@ -1064,202 +1277,239 @@ static void motion_mp1 (mpeg2_decoder_t * const decoder,
 				    motion->f_code[0] + motion->f_code[1]);
     motion->pmv[0][1] = motion_y;
 
-    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static void motion_fr_frame (mpeg2_decoder_t * const decoder,
-			     motion_t * const motion,
-			     mpeg2_mc_fct * const * const table)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-    int motion_x, motion_y;
-    unsigned int pos_x, pos_y, xy_half, offset;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
-    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,
-						     motion->f_code[1]);
-    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
-    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
-
-    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static void motion_fr_field (mpeg2_decoder_t * const decoder,
-			     motion_t * const motion,
-			     mpeg2_mc_fct * const * const table)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-    int motion_x, motion_y, field;
-    unsigned int pos_x, pos_y, xy_half, offset;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    field = UBITS (bit_buf, 1);
-    DUMPBITS (bit_buf, bits, 1);
-
-    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
-    motion->pmv[0][0] = motion_x;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (decoder,
-							    motion->f_code[1]);
-    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
-    motion->pmv[0][1] = motion_y << 1;
-
-    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 0, & ~1, field);
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    field = UBITS (bit_buf, 1);
-    DUMPBITS (bit_buf, bits, 1);
-
-    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
-    motion->pmv[1][0] = motion_x;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = (motion->pmv[1][1] >> 1) + get_motion_delta (decoder,
-							    motion->f_code[1]);
-    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
-    motion->pmv[1][1] = motion_y << 1;
-
-    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 1, & ~1, field);
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static void motion_fr_dmv (mpeg2_decoder_t * const decoder,
-			   motion_t * const motion,
-			   mpeg2_mc_fct * const * const table)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-    int motion_x, motion_y, dmv_x, dmv_y, m, other_x, other_y;
-    unsigned int pos_x, pos_y, xy_half, offset;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
-    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    dmv_x = get_dmv (decoder);
-
-    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (decoder,
-							    motion->f_code[1]);
-    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
-    motion->pmv[1][1] = motion->pmv[0][1] = motion_y << 1;
-    dmv_y = get_dmv (decoder);
-
-    m = decoder->top_field_first ? 1 : 3;
-    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;
-    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y - 1;
-    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 0, | 1, 0);
-
-    m = decoder->top_field_first ? 3 : 1;
-    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;
-    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y + 1;
-    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 1, & ~1, 0);
-
-    pos_x = 2 * decoder->offset + motion_x;
-    pos_y = decoder->v_offset + motion_y;
-    if (unlikely (pos_x > decoder->limit_x)) {
-	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;
-	motion_x = pos_x - 2 * decoder->offset;
-    }
-    if (unlikely (pos_y > decoder->limit_y)) {
-	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;
-	motion_y = pos_y - decoder->v_offset;
-    }
-    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);
-    offset = (pos_x >> 1) + (pos_y & ~1) * decoder->stride;
-    mpeg2_mc.avg[xy_half]
-	(decoder->dest[0] + decoder->offset,
-	 motion->ref[0][0] + offset, 2 * decoder->stride, 8);
-    mpeg2_mc.avg[xy_half]
-	(decoder->dest[0] + decoder->stride + decoder->offset,
-	 motion->ref[0][0] + decoder->stride + offset, 2 * decoder->stride, 8);
-    motion_x /= 2;	motion_y /= 2;
-    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);
-    offset = (((decoder->offset + motion_x) >> 1) +
-	      (((decoder->v_offset >> 1) + (motion_y & ~1)) *
-	       decoder->uv_stride));
-    mpeg2_mc.avg[4+xy_half]
-	(decoder->dest[1] + (decoder->offset >> 1),
-	 motion->ref[0][1] + offset, 2 * decoder->uv_stride, 4);
-    mpeg2_mc.avg[4+xy_half]
-	(decoder->dest[1] + decoder->uv_stride + (decoder->offset >> 1),
-	 motion->ref[0][1] + decoder->uv_stride + offset,
-	 2 * decoder->uv_stride, 4);
-    mpeg2_mc.avg[4+xy_half]
-	(decoder->dest[2] + (decoder->offset >> 1),
-	 motion->ref[0][2] + offset, 2 * decoder->uv_stride, 4);
-    mpeg2_mc.avg[4+xy_half]
-	(decoder->dest[2] + decoder->uv_stride + (decoder->offset >> 1),
-	 motion->ref[0][2] + decoder->uv_stride + offset,
-	 2 * decoder->uv_stride, 4);
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static inline void motion_reuse (const mpeg2_decoder_t * const decoder,
-				 const motion_t * const motion,
-				 mpeg2_mc_fct * const * const table)
-{
-    int motion_x, motion_y;
-    unsigned int pos_x, pos_y, xy_half, offset;
-
-    motion_x = motion->pmv[0][0];
-    motion_y = motion->pmv[0][1];
-
-    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);
+    MOTION_420 (table, motion->ref[0], motion_x, motion_y, 16, 0);
 }
 
-static inline void motion_zero (const mpeg2_decoder_t * const decoder,
-				const motion_t * const motion,
-				mpeg2_mc_fct * const * const table)
-{
-    unsigned int offset;
-
-    table[0] (decoder->dest[0] + decoder->offset,
-	      (motion->ref[0][0] + decoder->offset +
-	       decoder->v_offset * decoder->stride),
-	      decoder->stride, 16);
-
-    offset = ((decoder->offset >> 1) +
-	      (decoder->v_offset >> 1) * decoder->uv_stride);
-    table[4] (decoder->dest[1] + (decoder->offset >> 1),
-	      motion->ref[0][1] + offset, decoder->uv_stride, 8);
-    table[4] (decoder->dest[2] + (decoder->offset >> 1),
-	      motion->ref[0][2] + offset, decoder->uv_stride, 8);
-}
+#define MOTION_FUNCTIONS(FORMAT,MOTION,MOTION_FIELD,MOTION_DMV,MOTION_ZERO)   \
+/* Generates the eight motion parsers for one chroma FORMAT suffix. */	      \
+static void motion_fr_frame_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				      motion_t * const motion,		      \
+				      mpeg2_mc_fct * const * const table)     \
+{									      \
+    int motion_x, motion_y;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+    /* one vector; it becomes the predictor in pmv[0] and pmv[1] */	      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;			      \
+									      \
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);		      \
+}									      \
+/* fr_field: two vectors, each preceded by a 1-bit field select. */	      \
+static void motion_fr_field_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				      motion_t * const motion,		      \
+				      mpeg2_mc_fct * const * const table)     \
+{									      \
+    int motion_x, motion_y, field;					      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    field = UBITS (bit_buf, 1);						      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[0][0] = motion_x;					      \
+    /* vertical predictor is read halved and written back doubled */	      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = ((motion->pmv[0][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[0][1] = motion_y << 1;					      \
+									      \
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 0, & ~1, field); \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    field = UBITS (bit_buf, 1);						      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = ((motion->pmv[1][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[1][1] = motion_y << 1;					      \
+									      \
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 1, & ~1, field); \
+}									      \
+/* fr_dmv: one vector plus dmv_x/dmv_y; `table` is not passed on. */	      \
+static void motion_fr_dmv_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				    motion_t * const motion,		      \
+				    mpeg2_mc_fct * const * const table)	      \
+{									      \
+    int motion_x, motion_y, dmv_x, dmv_y, m, other_x, other_y;		      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    dmv_x = get_dmv (decoder);						      \
+									      \
+    motion_y = ((motion->pmv[0][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y << 1;		      \
+    dmv_y = get_dmv (decoder);						      \
+    /* m is 1 or 3 per top_field_first, and swapped for the 2nd call */	      \
+    m = decoder->top_field_first ? 1 : 3;				      \
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;		      \
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y - 1;	      \
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 0, | 1, 0); \
+									      \
+    m = decoder->top_field_first ? 3 : 1;				      \
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;		      \
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y + 1;	      \
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 1, & ~1, 0);\
+									      \
+    MOTION_DMV (mpeg2_mc.avg, motion->ref[0], motion_x, motion_y);	      \
+}									      \
+/* reuse: reads no bits; runs MOTION with the stored pmv[0]. */		      \
+static void motion_reuse_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				   motion_t * const motion,		      \
+				   mpeg2_mc_fct * const * const table)	      \
+{									      \
+    int motion_x, motion_y;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    motion_x = motion->pmv[0][0];					      \
+    motion_y = motion->pmv[0][1];					      \
+									      \
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);		      \
+}									      \
+/* zero: clears both pmv slots, then runs MOTION_ZERO on ref[0]. */	      \
+static void motion_zero_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				  motion_t * const motion,		      \
+				  mpeg2_mc_fct * const * const table)	      \
+{									      \
+    unsigned int offset;						      \
+									      \
+    motion->pmv[0][0] = motion->pmv[0][1] = 0;				      \
+    motion->pmv[1][0] = motion->pmv[1][1] = 0;				      \
+									      \
+    MOTION_ZERO (table, motion->ref[0]);				      \
+}									      \
+/* fi_field: one bit selects the ref2[] entry, then one vector. */	      \
+static void motion_fi_field_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				      motion_t * const motion,		      \
+				      mpeg2_mc_fct * const * const table)     \
+{									      \
+    int motion_x, motion_y;						      \
+    uint8_t ** ref_field;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];			      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;			      \
+									      \
+    MOTION (table, ref_field, motion_x, motion_y, 16, 0);		      \
+}									      \
+/* fi_16x8: two (ref select, vector) pairs; pmv[0], then pmv[1]. */	      \
+static void motion_fi_16x8_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				     motion_t * const motion,		      \
+				     mpeg2_mc_fct * const * const table)      \
+{									      \
+    int motion_x, motion_y;						      \
+    uint8_t ** ref_field;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];			      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[0][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[0][1] = motion_y;					      \
+									      \
+    MOTION (table, ref_field, motion_x, motion_y, 8, 0);		      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];			      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = motion->pmv[1][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion_y;					      \
+									      \
+    MOTION (table, ref_field, motion_x, motion_y, 8, 8);		      \
+}									      \
+/* fi_dmv: put from ref[0], then avg from ref[1] at a derived vector. */      \
+static void motion_fi_dmv_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				    motion_t * const motion,		      \
+				    mpeg2_mc_fct * const * const table)	      \
+{									      \
+    int motion_x, motion_y, other_x, other_y;				      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    other_x = ((motion_x + (motion_x > 0)) >> 1) + get_dmv (decoder);	      \
+									      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;			      \
+    other_y = (((motion_y + (motion_y > 0)) >> 1) + get_dmv (decoder) +	      \
+	       decoder->dmv_offset);					      \
+									      \
+    MOTION (mpeg2_mc.put, motion->ref[0], motion_x, motion_y, 16, 0);	      \
+    MOTION (mpeg2_mc.avg, motion->ref[1], other_x, other_y, 16, 0);	      \
+}									      \
+
+MOTION_FUNCTIONS (420, MOTION_420, MOTION_FIELD_420, MOTION_DMV_420,
+		  MOTION_ZERO_420)
+MOTION_FUNCTIONS (422, MOTION_422, MOTION_FIELD_422, MOTION_DMV_422,
+		  MOTION_ZERO_422)
+MOTION_FUNCTIONS (444, MOTION_444, MOTION_FIELD_444, MOTION_DMV_444,
+		  MOTION_ZERO_444)
 
 /* like motion_frame, but parsing without actual motion compensation */
 static void motion_fr_conceal (mpeg2_decoder_t * const decoder)
 {
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
     int tmp;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
@@ -1275,129 +1525,10 @@ static void motion_fr_conceal (mpeg2_decoder_t * const decoder)
     decoder->f_motion.pmv[1][1] = decoder->f_motion.pmv[0][1] = tmp;
 
     DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static void motion_fi_field (mpeg2_decoder_t * const decoder,
-			     motion_t * const motion,
-			     mpeg2_mc_fct * const * const table)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-    int motion_x, motion_y;
-    uint8_t ** ref_field;
-    unsigned int pos_x, pos_y, xy_half, offset;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    ref_field = motion->ref2[UBITS (bit_buf, 1)];
-    DUMPBITS (bit_buf, bits, 1);
-
-    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
-    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,
-						     motion->f_code[1]);
-    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
-    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
-
-    MOTION (table, ref_field, motion_x, motion_y, 16, 0);
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static void motion_fi_16x8 (mpeg2_decoder_t * const decoder,
-			    motion_t * const motion,
-			    mpeg2_mc_fct * const * const table)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-    int motion_x, motion_y;
-    uint8_t ** ref_field;
-    unsigned int pos_x, pos_y, xy_half, offset;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    ref_field = motion->ref2[UBITS (bit_buf, 1)];
-    DUMPBITS (bit_buf, bits, 1);
-
-    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
-    motion->pmv[0][0] = motion_x;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,
-						     motion->f_code[1]);
-    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
-    motion->pmv[0][1] = motion_y;
-
-    MOTION (table, ref_field, motion_x, motion_y, 8, 0);
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    ref_field = motion->ref2[UBITS (bit_buf, 1)];
-    DUMPBITS (bit_buf, bits, 1);
-
-    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
-    motion->pmv[1][0] = motion_x;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[1][1] + get_motion_delta (decoder,
-						     motion->f_code[1]);
-    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
-    motion->pmv[1][1] = motion_y;
-
-    MOTION (table, ref_field, motion_x, motion_y, 8, 8);
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static void motion_fi_dmv (mpeg2_decoder_t * const decoder,
-			   motion_t * const motion,
-			   mpeg2_mc_fct * const * const table)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-    int motion_x, motion_y, other_x, other_y;
-    unsigned int pos_x, pos_y, xy_half, offset;
-
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
-    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    other_x = ((motion_x + (motion_x > 0)) >> 1) + get_dmv (decoder);
-
-    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,
-						     motion->f_code[1]);
-    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
-    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
-    other_y = (((motion_y + (motion_y > 0)) >> 1) + get_dmv (decoder) +
-	       decoder->dmv_offset);
-
-    MOTION (mpeg2_mc.put, motion->ref[0], motion_x, motion_y, 16, 0);
-    MOTION (mpeg2_mc.avg, motion->ref[1], other_x, other_y, 16, 0);
-#undef bit_buf
-#undef bits
-#undef bit_ptr
 }
 
 static void motion_fi_conceal (mpeg2_decoder_t * const decoder)
 {
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
     int tmp;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
@@ -1415,10 +1546,11 @@ static void motion_fi_conceal (mpeg2_decoder_t * const decoder)
     decoder->f_motion.pmv[1][1] = decoder->f_motion.pmv[0][1] = tmp;
 
     DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+}
+
 #undef bit_buf
 #undef bits
 #undef bit_ptr
-}
 
 #define MOTION_CALL(routine,direction)				\
 do {								\
@@ -1436,14 +1568,14 @@ do {									\
     if (decoder->offset == decoder->width) {				\
 	do { /* just so we can use the break statement */		\
 	    if (decoder->convert) {					\
-		decoder->convert (decoder->fbuf_id, decoder->dest,	\
+		decoder->convert (decoder->convert_id, decoder->dest,	\
 				  decoder->v_offset);			\
 		if (decoder->coding_type == B_TYPE)			\
 		    break;						\
 	    }								\
-	    decoder->dest[0] += 16 * decoder->stride;			\
-	    decoder->dest[1] += 4 * decoder->stride;			\
-	    decoder->dest[2] += 4 * decoder->stride;			\
+	    decoder->dest[0] += decoder->slice_stride;			\
+	    decoder->dest[1] += decoder->slice_uv_stride;		\
+	    decoder->dest[2] += decoder->slice_uv_stride;		\
 	} while (0);							\
 	decoder->v_offset += 16;					\
 	if (decoder->v_offset > decoder->limit_y) {			\
@@ -1460,7 +1592,7 @@ void mpeg2_init_fbuf (mpeg2_decoder_t * decoder, uint8_t * current_fbuf[3],
 {
     int offset, stride, height, bottom_field;
 
-    stride = decoder->width;
+    stride = decoder->stride_frame;
     bottom_field = (decoder->picture_structure == BOTTOM_FIELD);
     offset = bottom_field ? stride : 0;
     height = decoder->height;
@@ -1469,15 +1601,9 @@ void mpeg2_init_fbuf (mpeg2_decoder_t * decoder, uint8_t * current_fbuf[3],
     decoder->picture_dest[1] = current_fbuf[1] + (offset >> 1);
     decoder->picture_dest[2] = current_fbuf[2] + (offset >> 1);
 
-    if (forward_fbuf) {
-      decoder->f_motion.ref[0][0] = forward_fbuf[0] + offset;
-      decoder->f_motion.ref[0][1] = forward_fbuf[1] + (offset >> 1);
-      decoder->f_motion.ref[0][2] = forward_fbuf[2] + (offset >> 1);
-    } else {
-      decoder->f_motion.ref[0][0] = 0;
-      decoder->f_motion.ref[0][1] = 0;
-      decoder->f_motion.ref[0][2] = 0;
-    }
+    decoder->f_motion.ref[0][0] = forward_fbuf[0] + offset;
+    decoder->f_motion.ref[0][1] = forward_fbuf[1] + (offset >> 1);
+    decoder->f_motion.ref[0][2] = forward_fbuf[2] + (offset >> 1);
 
     decoder->b_motion.ref[0][0] = backward_fbuf[0] + offset;
     decoder->b_motion.ref[0][1] = backward_fbuf[1] + (offset >> 1);
@@ -1494,15 +1620,9 @@ void mpeg2_init_fbuf (mpeg2_decoder_t * decoder, uint8_t * current_fbuf[3],
 	if (decoder->second_field && (decoder->coding_type != B_TYPE))
 	    forward_fbuf = current_fbuf;
 
-        if (forward_fbuf) {
-	  decoder->f_motion.ref[1][0] = forward_fbuf[0] + offset;
-  	  decoder->f_motion.ref[1][1] = forward_fbuf[1] + (offset >> 1);
-	  decoder->f_motion.ref[1][2] = forward_fbuf[2] + (offset >> 1);
-        } else {
-          decoder->f_motion.ref[0][0] = 0;
-          decoder->f_motion.ref[0][1] = 0;
-          decoder->f_motion.ref[0][2] = 0;
-        }
+	decoder->f_motion.ref[1][0] = forward_fbuf[0] + offset;
+	decoder->f_motion.ref[1][1] = forward_fbuf[1] + (offset >> 1);
+	decoder->f_motion.ref[1][2] = forward_fbuf[2] + (offset >> 1);
 
 	decoder->b_motion.ref[1][0] = backward_fbuf[0] + offset;
 	decoder->b_motion.ref[1][1] = backward_fbuf[1] + (offset >> 1);
@@ -1514,10 +1634,59 @@ void mpeg2_init_fbuf (mpeg2_decoder_t * decoder, uint8_t * current_fbuf[3],
 
     decoder->stride = stride;
     decoder->uv_stride = stride >> 1;
+    decoder->slice_stride = 16 * stride;
+    decoder->slice_uv_stride =
+	decoder->slice_stride >> (2 - decoder->chroma_format);
     decoder->limit_x = 2 * decoder->width - 32;
     decoder->limit_y_16 = 2 * height - 32;
     decoder->limit_y_8 = 2 * height - 16;
     decoder->limit_y = height - 16;
+
+    if (decoder->mpeg1) {
+	decoder->motion_parser[0] = motion_zero_420;
+	decoder->motion_parser[MC_FRAME] = motion_mp1;
+	decoder->motion_parser[4] = motion_reuse_420;
+    } else if (decoder->picture_structure == FRAME_PICTURE) {
+	if (decoder->chroma_format == 0) {
+	    decoder->motion_parser[0] = motion_zero_420;
+	    decoder->motion_parser[MC_FIELD] = motion_fr_field_420;
+	    decoder->motion_parser[MC_FRAME] = motion_fr_frame_420;
+	    decoder->motion_parser[MC_DMV] = motion_fr_dmv_420;
+	    decoder->motion_parser[4] = motion_reuse_420;
+	} else if (decoder->chroma_format == 1) {
+	    decoder->motion_parser[0] = motion_zero_422;
+	    decoder->motion_parser[MC_FIELD] = motion_fr_field_422;
+	    decoder->motion_parser[MC_FRAME] = motion_fr_frame_422;
+	    decoder->motion_parser[MC_DMV] = motion_fr_dmv_422;
+	    decoder->motion_parser[4] = motion_reuse_422;
+	} else {
+	    decoder->motion_parser[0] = motion_zero_444;
+	    decoder->motion_parser[MC_FIELD] = motion_fr_field_444;
+	    decoder->motion_parser[MC_FRAME] = motion_fr_frame_444;
+	    decoder->motion_parser[MC_DMV] = motion_fr_dmv_444;
+	    decoder->motion_parser[4] = motion_reuse_444;
+	}
+    } else {
+	if (decoder->chroma_format == 0) {
+	    decoder->motion_parser[0] = motion_zero_420;
+	    decoder->motion_parser[MC_FIELD] = motion_fi_field_420;
+	    decoder->motion_parser[MC_16X8] = motion_fi_16x8_420;
+	    decoder->motion_parser[MC_DMV] = motion_fi_dmv_420;
+	    decoder->motion_parser[4] = motion_reuse_420;
+	} else if (decoder->chroma_format == 1) {
+	    decoder->motion_parser[0] = motion_zero_422;
+	    decoder->motion_parser[MC_FIELD] = motion_fi_field_422;
+	    decoder->motion_parser[MC_16X8] = motion_fi_16x8_422;
+	    decoder->motion_parser[MC_DMV] = motion_fi_dmv_422;
+	    decoder->motion_parser[4] = motion_reuse_422;
+	} else {
+	    decoder->motion_parser[0] = motion_zero_444;
+	    decoder->motion_parser[MC_FIELD] = motion_fi_field_444;
+	    decoder->motion_parser[MC_16X8] = motion_fi_16x8_444;
+	    decoder->motion_parser[MC_DMV] = motion_fi_dmv_444;
+	    decoder->motion_parser[4] = motion_reuse_444;
+	}
+    }
 }
 
 static inline int slice_init (mpeg2_decoder_t * const decoder, int code)
@@ -1529,7 +1698,7 @@ static inline int slice_init (mpeg2_decoder_t * const decoder, int code)
     const MBAtab * mba;
 
     decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
-	decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
+	decoder->dc_dct_pred[2] = 16384;
 
     decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
     decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
@@ -1543,13 +1712,14 @@ static inline int slice_init (mpeg2_decoder_t * const decoder, int code)
     decoder->v_offset = (code - 1) * 16;
     offset = 0;
     if (!(decoder->convert) || decoder->coding_type != B_TYPE)
-	offset = (code - 1) * decoder->stride * 4;
+	offset = (code - 1) * decoder->slice_stride;
 
-    decoder->dest[0] = decoder->picture_dest[0] + offset * 4;
+    decoder->dest[0] = decoder->picture_dest[0] + offset;
+    offset >>= (2 - decoder->chroma_format);
     decoder->dest[1] = decoder->picture_dest[1] + offset;
     decoder->dest[2] = decoder->picture_dest[2] + offset;
 
-    decoder->quantizer_scale = get_quantizer_scale (decoder);
+    get_quantizer_scale (decoder);
 
     /* ignore intra_slice and all the extra data */
     while (bit_buf & 0x80000000) {
@@ -1587,9 +1757,9 @@ static inline int slice_init (mpeg2_decoder_t * const decoder, int code)
     while (decoder->offset - decoder->width >= 0) {
 	decoder->offset -= decoder->width;
 	if (!(decoder->convert) || decoder->coding_type != B_TYPE) {
-	    decoder->dest[0] += 16 * decoder->stride;
-	    decoder->dest[1] += 4 * decoder->stride;
-	    decoder->dest[2] += 4 * decoder->stride;
+	    decoder->dest[0] += decoder->slice_stride;
+	    decoder->dest[1] += decoder->slice_uv_stride;
+	    decoder->dest[2] += decoder->slice_uv_stride;
 	}
 	decoder->v_offset += 16;
     }
@@ -1629,7 +1799,7 @@ void mpeg2_slice (mpeg2_decoder_t * const decoder, const int code,
 
 	/* maybe integrate MACROBLOCK_QUANT test into get_macroblock_modes ? */
 	if (macroblock_modes & MACROBLOCK_QUANT)
-	    decoder->quantizer_scale = get_quantizer_scale (decoder);
+	    get_quantizer_scale (decoder);
 
 	if (macroblock_modes & MACROBLOCK_INTRA) {
 
@@ -1663,72 +1833,49 @@ void mpeg2_slice (mpeg2_decoder_t * const decoder, const int code,
 	    slice_intra_DCT (decoder, 0, dest_y + 8, DCT_stride);
 	    slice_intra_DCT (decoder, 0, dest_y + DCT_offset, DCT_stride);
 	    slice_intra_DCT (decoder, 0, dest_y + DCT_offset + 8, DCT_stride);
-	    slice_intra_DCT (decoder, 1, decoder->dest[1] + (offset >> 1),
-			     decoder->uv_stride);
-	    slice_intra_DCT (decoder, 2, decoder->dest[2] + (offset >> 1),
-			     decoder->uv_stride);
-
-	    if (decoder->coding_type == D_TYPE) {
-		NEEDBITS (bit_buf, bits, bit_ptr);
-		DUMPBITS (bit_buf, bits, 1);
+	    if (likely (decoder->chroma_format == 0)) {
+		slice_intra_DCT (decoder, 1, decoder->dest[1] + (offset >> 1),
+				 decoder->uv_stride);
+		slice_intra_DCT (decoder, 2, decoder->dest[2] + (offset >> 1),
+				 decoder->uv_stride);
+		if (decoder->coding_type == D_TYPE) {
+		    NEEDBITS (bit_buf, bits, bit_ptr);
+		    DUMPBITS (bit_buf, bits, 1);
+		}
+	    } else if (likely (decoder->chroma_format == 1)) {
+		uint8_t * dest_u = decoder->dest[1] + (offset >> 1);
+		uint8_t * dest_v = decoder->dest[2] + (offset >> 1);
+		DCT_stride >>= 1;
+		DCT_offset >>= 1;
+		slice_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset, DCT_stride);
+	    } else {
+		uint8_t * dest_u = decoder->dest[1] + offset;
+		uint8_t * dest_v = decoder->dest[2] + offset;
+		slice_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + 8, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + 8, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset + 8,
+				 DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset + 8,
+				 DCT_stride);
 	    }
 	} else {
 
-	    if (decoder->picture_structure == FRAME_PICTURE)
-		switch (macroblock_modes & MOTION_TYPE_MASK) {
-		case MC_FRAME:
-		    if (decoder->mpeg1)
-			MOTION_CALL (motion_mp1, macroblock_modes);
-		    else
-			MOTION_CALL (motion_fr_frame, macroblock_modes);
-		    break;
-
-		case MC_FIELD:
-		    MOTION_CALL (motion_fr_field, macroblock_modes);
-		    break;
-
-		case MC_DMV:
-		    MOTION_CALL (motion_fr_dmv, MACROBLOCK_MOTION_FORWARD);
-		    break;
-
-		case 0:
-		    /* non-intra mb without forward mv in a P picture */
-		    decoder->f_motion.pmv[0][0] = 0;
-		    decoder->f_motion.pmv[0][1] = 0;
-		    decoder->f_motion.pmv[1][0] = 0;
-		    decoder->f_motion.pmv[1][1] = 0;
-		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
-		    break;
-		}
-	    else
-		switch (macroblock_modes & MOTION_TYPE_MASK) {
-		case MC_FIELD:
-		    MOTION_CALL (motion_fi_field, macroblock_modes);
-		    break;
-
-		case MC_16X8:
-		    MOTION_CALL (motion_fi_16x8, macroblock_modes);
-		    break;
-
-		case MC_DMV:
-		    MOTION_CALL (motion_fi_dmv, MACROBLOCK_MOTION_FORWARD);
-		    break;
-
-		case 0:
-		    /* non-intra mb without forward mv in a P picture */
-		    decoder->f_motion.pmv[0][0] = 0;
-		    decoder->f_motion.pmv[0][1] = 0;
-		    decoder->f_motion.pmv[1][0] = 0;
-		    decoder->f_motion.pmv[1][1] = 0;
-		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
-		    break;
-		}
+	    motion_parser_t * parser;
+
+	    parser =
+		decoder->motion_parser[macroblock_modes >> MOTION_TYPE_SHIFT];
+	    MOTION_CALL (parser, macroblock_modes);
 
 	    if (macroblock_modes & MACROBLOCK_PATTERN) {
 		int coded_block_pattern;
 		int DCT_offset, DCT_stride;
-		int offset;
-		uint8_t * dest_y;
 
 		if (macroblock_modes & DCT_TYPE_INTERLACED) {
 		    DCT_offset = decoder->stride;
@@ -1740,30 +1887,123 @@ void mpeg2_slice (mpeg2_decoder_t * const decoder, const int code,
 
 		coded_block_pattern = get_coded_block_pattern (decoder);
 
-		offset = decoder->offset;
-		dest_y = decoder->dest[0] + offset;
-		if (coded_block_pattern & 0x20)
-		    slice_non_intra_DCT (decoder, dest_y, DCT_stride);
-		if (coded_block_pattern & 0x10)
-		    slice_non_intra_DCT (decoder, dest_y + 8, DCT_stride);
-		if (coded_block_pattern & 0x08)
-		    slice_non_intra_DCT (decoder, dest_y + DCT_offset,
-					 DCT_stride);
-		if (coded_block_pattern & 0x04)
-		    slice_non_intra_DCT (decoder, dest_y + DCT_offset + 8,
-					 DCT_stride);
-		if (coded_block_pattern & 0x2)
-		    slice_non_intra_DCT (decoder,
-					 decoder->dest[1] + (offset >> 1),
-					 decoder->uv_stride);
-		if (coded_block_pattern & 0x1)
-		    slice_non_intra_DCT (decoder,
-					 decoder->dest[2] + (offset >> 1),
-					 decoder->uv_stride);
+		if (likely (decoder->chroma_format == 0)) {
+		    int offset = decoder->offset;
+		    uint8_t * dest_y = decoder->dest[0] + offset;
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + (offset >> 1),
+					     decoder->uv_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + (offset >> 1),
+					     decoder->uv_stride);
+		} else if (likely (decoder->chroma_format == 1)) {
+		    int offset;
+		    uint8_t * dest_y;
+
+		    coded_block_pattern |= bit_buf & (3 << 30);
+		    DUMPBITS (bit_buf, bits, 2);
+
+		    offset = decoder->offset;
+		    dest_y = decoder->dest[0] + offset;
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+
+		    DCT_stride >>= 1;
+		    DCT_offset = (DCT_offset + offset) >> 1;
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + (offset >> 1),
+					     DCT_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + (offset >> 1),
+					     DCT_stride);
+		    if (coded_block_pattern & (2 << 30))
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (1 << 30))
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + DCT_offset,
+					     DCT_stride);
+		} else {
+		    int offset;
+		    uint8_t * dest_y, * dest_u, * dest_v;
+
+		    coded_block_pattern |= bit_buf & (63 << 26);
+		    DUMPBITS (bit_buf, bits, 6);
+
+		    offset = decoder->offset;
+		    dest_y = decoder->dest[0] + offset;
+		    dest_u = decoder->dest[1] + offset;
+		    dest_v = decoder->dest[2] + offset;
+
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		    if (coded_block_pattern & (32 << 26))
+			slice_non_intra_DCT (decoder, 1, dest_u + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (16 << 26))
+			slice_non_intra_DCT (decoder, 2, dest_v + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (8 << 26))
+			slice_non_intra_DCT (decoder, 1, dest_u + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (4 << 26))
+			slice_non_intra_DCT (decoder, 2, dest_v + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (2 << 26))
+			slice_non_intra_DCT (decoder, 1,
+					     dest_u + DCT_offset + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (1 << 26))
+			slice_non_intra_DCT (decoder, 2,
+					     dest_v + DCT_offset + 8,
+					     DCT_stride);
+		}
 	    }
 
 	    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
-		decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
+		decoder->dc_dct_pred[2] = 16384;
 	}
 
 	NEXT_MACROBLOCK;
@@ -1796,19 +2036,17 @@ void mpeg2_slice (mpeg2_decoder_t * const decoder, const int code,
 
 	if (mba_inc) {
 	    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
-		decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
+		decoder->dc_dct_pred[2] = 16384;
 
 	    if (decoder->coding_type == P_TYPE) {
-		decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
-		decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
-
 		do {
-		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
+		    MOTION_CALL (decoder->motion_parser[0],
+				 MACROBLOCK_MOTION_FORWARD);
 		    NEXT_MACROBLOCK;
 		} while (--mba_inc);
 	    } else {
 		do {
-		    MOTION_CALL (motion_reuse, macroblock_modes);
+		    MOTION_CALL (decoder->motion_parser[4], macroblock_modes);
 		    NEXT_MACROBLOCK;
 		} while (--mba_inc);
 	    }
diff --git a/src/libmpeg2new/libmpeg2/uyvy.c b/src/libmpeg2new/libmpeg2/uyvy.c
new file mode 100644
index 000000000..7f107ffad
--- /dev/null
+++ b/src/libmpeg2new/libmpeg2/uyvy.c
@@ -0,0 +1,123 @@
+/*
+ * uyvy.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2003      Regis Duchesne <hpreg@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "mpeg2convert.h"
+
+typedef struct {	/* converter state shared by uyvy_start and uyvy_copy */
+    int width;		/* copied from seq->width by mpeg2convert_uyvy */
+    int stride;		/* set per picture by uyvy_start: width, doubled when nb_fields == 1 */
+    int chroma420;	/* 1 when seq->chroma_height < seq->height, else 0 */
+    uint8_t * out;	/* output position chosen by uyvy_start for the current picture */
+} convert_uyvy_t;
+
+static void uyvy_start (void * _id, const mpeg2_fbuf_t * fbuf,
+			const mpeg2_picture_t * picture,
+			const mpeg2_gop_t * gop)
+{
+    /* Per-picture setup: pick the first output byte and the line pitch. */
+    convert_uyvy_t * const conv = (convert_uyvy_t *) _id;
+    const int one_field = (picture->nb_fields == 1);
+    uint8_t * first = fbuf->buf[0];
+
+    if (one_field && ! (picture->flags & PIC_FLAG_TOP_FIELD_FIRST))
+	first += 2 * conv->width;	/* flag clear: begin 2 * width bytes in */
+    conv->out = first;
+    conv->stride = one_field ? conv->width << 1 : conv->width;
+}
+
+#ifdef WORDS_BIGENDIAN	/* either way, argument a ends up in the lowest-addressed byte */
+#define PACK(a,b,c,d) (((uint32_t) (a) << 24) | ((b) << 16) | ((c) << 8) | (d))
+#else	/* little-endian host: a goes into the least significant byte */
+#define PACK(a,b,c,d) (((uint32_t) (d) << 24) | ((c) << 16) | ((b) << 8) | (a))
+#endif	/* uint32_t cast: a byte >= 0x80 must not be shifted into int's sign bit */
+
+static void uyvy_copy (void * const _id, uint8_t * const * src,
+		       const unsigned int v_offset)
+{
+    /* Pack 16 lines of planar src[0..2] into 32-bit U Y V Y words at id->out. */
+    const convert_uyvy_t * const id = (convert_uyvy_t *) _id;
+    const int groups = id->width >> 4;		/* 16-pixel groups per line */
+    const int chroma_width = id->width >> 1;
+    const int chroma_stride = id->stride >> 1;
+    uint8_t * line = id->out + 2 * id->stride * v_offset;
+    uint8_t * y_row = src[0];
+    uint8_t * u_row = src[1];
+    uint8_t * v_row = src[2];
+    int rows_left = 16;
+
+    do {
+	uint32_t * out = (uint32_t *) line;
+	int todo = groups;
+
+	do {
+	    int k;
+
+	    /* one store = two pixels; PACK puts its first argument first in memory */
+	    for (k = 0; k < 8; k++)
+		out[k] = PACK (u_row[k], y_row[2 * k], v_row[k], y_row[2 * k + 1]);
+	    y_row += 16;
+	    u_row += 8;
+	    v_row += 8;
+	    out += 8;
+	} while (--todo);
+
+	/* step back by the picture width, then move on to the next line */
+	y_row += id->stride - id->width;
+	u_row -= chroma_width;
+	v_row -= chroma_width;
+	line += 2 * id->stride;
+	if ((--rows_left & id->chroma420) == 0) {	/* chroma420 set: odd counts skip */
+	    u_row += chroma_stride;
+	    v_row += chroma_stride;
+	}
+    } while (rows_left);
+}
+
+int mpeg2convert_uyvy (int stage, void * _id, const mpeg2_sequence_t * seq,
+		       int stride, uint32_t accel, void * arg,
+		       mpeg2_convert_init_t * result)
+{	/* stage, stride, accel and arg are not used here */
+    convert_uyvy_t * const conv = (convert_uyvy_t *) _id;
+
+    if (seq->width == seq->chroma_width)	/* chroma as wide as luma: declined */
+	return 1;
+
+    if (!conv) {	/* no instance yet: just report the state size needed */
+	result->id_size = sizeof (convert_uyvy_t);
+	return 0;
+    }
+    conv->width = seq->width;
+    conv->chroma420 = (seq->chroma_height < seq->height);
+    result->buf_size[0] = seq->width * seq->height * 2;
+    result->buf_size[1] = 0;
+    result->buf_size[2] = 0;
+    result->start = uyvy_start;
+    result->copy = uyvy_copy;
+    return 0;
+}
diff --git a/src/libmpeg2new/libmpeg2/vlc.h b/src/libmpeg2new/libmpeg2/vlc.h
index 8fa6b75bd..57448ce04 100644
--- a/src/libmpeg2new/libmpeg2/vlc.h
+++ b/src/libmpeg2new/libmpeg2/vlc.h
@@ -121,7 +121,7 @@ static const MBtab MB_P [] = {
 #define INTER MACROBLOCK_MOTION_FORWARD|MACROBLOCK_MOTION_BACKWARD
 
 static const MBtab MB_B [] = {
-    {0,                 0}, {INTRA|QUANT,       6},
+    {0,                 6}, {INTRA|QUANT,       6},
     {BWD|CODED|QUANT,   6}, {FWD|CODED|QUANT,   6},
     {INTER|CODED|QUANT, 5}, {INTER|CODED|QUANT, 5},
 					{INTRA,       5}, {INTRA,       5},
@@ -170,53 +170,53 @@ static const DMVtab DMV_2 [] = {
 
 
 static const CBPtab CBP_7 [] = {
-    {0x22, 7}, {0x12, 7}, {0x0a, 7}, {0x06, 7},
-    {0x21, 7}, {0x11, 7}, {0x09, 7}, {0x05, 7},
-    {0x3f, 6}, {0x3f, 6}, {0x03, 6}, {0x03, 6},
-    {0x24, 6}, {0x24, 6}, {0x18, 6}, {0x18, 6},
-    {0x3e, 5}, {0x3e, 5}, {0x3e, 5}, {0x3e, 5},
-    {0x02, 5}, {0x02, 5}, {0x02, 5}, {0x02, 5},
-    {0x3d, 5}, {0x3d, 5}, {0x3d, 5}, {0x3d, 5},
-    {0x01, 5}, {0x01, 5}, {0x01, 5}, {0x01, 5},
-    {0x38, 5}, {0x38, 5}, {0x38, 5}, {0x38, 5},
-    {0x34, 5}, {0x34, 5}, {0x34, 5}, {0x34, 5},
-    {0x2c, 5}, {0x2c, 5}, {0x2c, 5}, {0x2c, 5},
-    {0x1c, 5}, {0x1c, 5}, {0x1c, 5}, {0x1c, 5},
-    {0x28, 5}, {0x28, 5}, {0x28, 5}, {0x28, 5},
-    {0x14, 5}, {0x14, 5}, {0x14, 5}, {0x14, 5},
-    {0x30, 5}, {0x30, 5}, {0x30, 5}, {0x30, 5},
+    {0x11, 7}, {0x12, 7}, {0x14, 7}, {0x18, 7},	/* NOTE(review): cbp values are the 6-bit bit-reversal of the earlier encoding (0x22 -> 0x11); presumably bit 0 now selects the first luma block -- confirm in slice.c */
+    {0x21, 7}, {0x22, 7}, {0x24, 7}, {0x28, 7},
+    {0x3f, 6}, {0x3f, 6}, {0x30, 6}, {0x30, 6},
+    {0x09, 6}, {0x09, 6}, {0x06, 6}, {0x06, 6},
+    {0x1f, 5}, {0x1f, 5}, {0x1f, 5}, {0x1f, 5},
+    {0x10, 5}, {0x10, 5}, {0x10, 5}, {0x10, 5},
+    {0x2f, 5}, {0x2f, 5}, {0x2f, 5}, {0x2f, 5},
+    {0x20, 5}, {0x20, 5}, {0x20, 5}, {0x20, 5},
+    {0x07, 5}, {0x07, 5}, {0x07, 5}, {0x07, 5},
+    {0x0b, 5}, {0x0b, 5}, {0x0b, 5}, {0x0b, 5},
+    {0x0d, 5}, {0x0d, 5}, {0x0d, 5}, {0x0d, 5},
+    {0x0e, 5}, {0x0e, 5}, {0x0e, 5}, {0x0e, 5},
+    {0x05, 5}, {0x05, 5}, {0x05, 5}, {0x05, 5},
+    {0x0a, 5}, {0x0a, 5}, {0x0a, 5}, {0x0a, 5},
+    {0x03, 5}, {0x03, 5}, {0x03, 5}, {0x03, 5},
     {0x0c, 5}, {0x0c, 5}, {0x0c, 5}, {0x0c, 5},
-    {0x20, 4}, {0x20, 4}, {0x20, 4}, {0x20, 4},
-    {0x20, 4}, {0x20, 4}, {0x20, 4}, {0x20, 4},
-    {0x10, 4}, {0x10, 4}, {0x10, 4}, {0x10, 4},
-    {0x10, 4}, {0x10, 4}, {0x10, 4}, {0x10, 4},
-    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
-    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x01, 4}, {0x01, 4}, {0x01, 4}, {0x01, 4},
+    {0x01, 4}, {0x01, 4}, {0x01, 4}, {0x01, 4},
+    {0x02, 4}, {0x02, 4}, {0x02, 4}, {0x02, 4},
+    {0x02, 4}, {0x02, 4}, {0x02, 4}, {0x02, 4},
     {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
     {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
-    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
-    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
-    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
-    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3}
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3}
 };
 
 static const CBPtab CBP_9 [] = {
-    {0,    0}, {0x00, 9}, {0x27, 9}, {0x1b, 9},
-    {0x3b, 9}, {0x37, 9}, {0x2f, 9}, {0x1f, 9},
-    {0x3a, 8}, {0x3a, 8}, {0x36, 8}, {0x36, 8},
-    {0x2e, 8}, {0x2e, 8}, {0x1e, 8}, {0x1e, 8},
-    {0x39, 8}, {0x39, 8}, {0x35, 8}, {0x35, 8},
-    {0x2d, 8}, {0x2d, 8}, {0x1d, 8}, {0x1d, 8},
-    {0x26, 8}, {0x26, 8}, {0x1a, 8}, {0x1a, 8},
-    {0x25, 8}, {0x25, 8}, {0x19, 8}, {0x19, 8},
-    {0x2b, 8}, {0x2b, 8}, {0x17, 8}, {0x17, 8},
-    {0x33, 8}, {0x33, 8}, {0x0f, 8}, {0x0f, 8},
-    {0x2a, 8}, {0x2a, 8}, {0x16, 8}, {0x16, 8},
-    {0x32, 8}, {0x32, 8}, {0x0e, 8}, {0x0e, 8},
-    {0x29, 8}, {0x29, 8}, {0x15, 8}, {0x15, 8},
-    {0x31, 8}, {0x31, 8}, {0x0d, 8}, {0x0d, 8},
-    {0x23, 8}, {0x23, 8}, {0x13, 8}, {0x13, 8},
-    {0x0b, 8}, {0x0b, 8}, {0x07, 8}, {0x07, 8}
+    {0,    9}, {0x00, 9}, {0x39, 9}, {0x36, 9},	/* NOTE(review): cbp values are the 6-bit bit-reversal of the earlier encoding (0x27 -> 0x39); the first entry's length is 9 where it used to be 0 -- confirm the consumer expects both */
+    {0x37, 9}, {0x3b, 9}, {0x3d, 9}, {0x3e, 9},
+    {0x17, 8}, {0x17, 8}, {0x1b, 8}, {0x1b, 8},
+    {0x1d, 8}, {0x1d, 8}, {0x1e, 8}, {0x1e, 8},
+    {0x27, 8}, {0x27, 8}, {0x2b, 8}, {0x2b, 8},
+    {0x2d, 8}, {0x2d, 8}, {0x2e, 8}, {0x2e, 8},
+    {0x19, 8}, {0x19, 8}, {0x16, 8}, {0x16, 8},
+    {0x29, 8}, {0x29, 8}, {0x26, 8}, {0x26, 8},
+    {0x35, 8}, {0x35, 8}, {0x3a, 8}, {0x3a, 8},
+    {0x33, 8}, {0x33, 8}, {0x3c, 8}, {0x3c, 8},
+    {0x15, 8}, {0x15, 8}, {0x1a, 8}, {0x1a, 8},
+    {0x13, 8}, {0x13, 8}, {0x1c, 8}, {0x1c, 8},
+    {0x25, 8}, {0x25, 8}, {0x2a, 8}, {0x2a, 8},
+    {0x23, 8}, {0x23, 8}, {0x2c, 8}, {0x2c, 8},
+    {0x31, 8}, {0x31, 8}, {0x32, 8}, {0x32, 8},
+    {0x34, 8}, {0x34, 8}, {0x38, 8}, {0x38, 8}
 };
 
 
@@ -289,7 +289,7 @@ static const DCTtab DCT_B14_10 [] = {
 };
 
 static const DCTtab DCT_B14_8 [] = {
-    { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6},
+    { 65, 0,12}, { 65, 0,12}, { 65, 0,12}, { 65, 0,12},
     {  3, 2, 7}, {  3, 2, 7}, { 10, 1, 7}, { 10, 1, 7},
     {  1, 4, 7}, {  1, 4, 7}, {  9, 1, 7}, {  9, 1, 7},
     {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6},
@@ -326,7 +326,7 @@ static const DCTtab DCT_B15_10 [] = {
 };
 
 static const DCTtab DCT_B15_8 [] = {
-    { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6},
+    { 65, 0,12}, { 65, 0,12}, { 65, 0,12}, { 65, 0,12},
     {  8, 1, 7}, {  8, 1, 7}, {  9, 1, 7}, {  9, 1, 7},
     {  7, 1, 7}, {  7, 1, 7}, {  3, 2, 7}, {  3, 2, 7},
     {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6},