From 4c0836e8f1504db9bbb329a1351050a8ff2cf469 Mon Sep 17 00:00:00 2001
From: Guenter Bartsch <guenter@users.sourceforge.net>
Date: Tue, 7 Aug 2001 23:59:50 +0000
Subject: latest ffmpeg updates - mmx works\! :-)

CVS patchset: 400
CVS date: 2001/08/07 23:59:50
---
 src/libffmpeg/config.h                 |   3 +
 src/libffmpeg/libavcodec/Makefile.am   |   4 +-
 src/libffmpeg/libavcodec/dsputil.c     |  75 ++++-
 src/libffmpeg/libavcodec/dsputil.h     |  12 +
 src/libffmpeg/libavcodec/dsputil_mmx.c |  38 +--
 src/libffmpeg/libavcodec/fdctref.c     |  36 ++
 src/libffmpeg/libavcodec/idct_mmx.c    | 592 +++++++++++++++++++++++++++++++++
 src/libffmpeg/libavcodec/jrevdct.c     |  16 +-
 src/libffmpeg/libavcodec/mjpeg.c       |  16 +-
 src/libffmpeg/libavcodec/mpeg12.c      |  30 +-
 src/libffmpeg/libavcodec/mpeg12data.h  |  16 +-
 src/libffmpeg/libavcodec/mpeg4data.h   |  23 --
 src/libffmpeg/libavcodec/mpegvideo.c   |   9 +-
 src/libffmpeg/libavcodec/mpegvideo.h   |   9 +-
 src/video_out/yuv2rgb.c                |  25 +-
 src/xine-engine/cpu_accel.h            |   2 +-
 16 files changed, 795 insertions(+), 111 deletions(-)
 create mode 100644 src/libffmpeg/libavcodec/idct_mmx.c

(limited to 'src')

diff --git a/src/libffmpeg/config.h b/src/libffmpeg/config.h
index e560cbebb..674150a19 100644
--- a/src/libffmpeg/config.h
+++ b/src/libffmpeg/config.h
@@ -4,3 +4,6 @@
 
 #include "../../config.h"
 
+#ifdef ARCH_X86
+#define HAVE_MMX
+#endif
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index 046974ac8..c9a19f498 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -9,8 +9,8 @@ LIBTOOL = $(SHELL) $(top_builddir)/libtool-nofpic
 noinst_LTLIBRARIES = libavcodec.la
 
 if HAVE_FFMMX
-#mmx_modules = mpegvideo_mmx.c sad_mmx.s dsputil_mmx.c
-mmx_modules = mpegvideo_mmx.c sad_mmx.s
+mmx_modules = mpegvideo_mmx.c sad_mmx.s dsputil_mmx.c idct_mmx.c
+#mmx_modules = mpegvideo_mmx.c sad_mmx.s
 endif
 
 libavcodec_la_SOURCES = dsputil.c   fdctref.c   jfdctfst.c  mpeg12.c   \
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index a41f1bef6..b4c06b820 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -34,6 +34,42 @@ op_pixels_abs_func pix_abs16x16_xy2;
 static UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
 UINT32 squareTbl[512];
 
+extern UINT16 default_intra_matrix[64];
+extern UINT16 default_non_intra_matrix[64];
+
+UINT8 zigzag_direct[64] = {
+    0, 1, 8, 16, 9, 2, 3, 10,
+    17, 24, 32, 25, 18, 11, 4, 5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+    27, 20, 13, 6, 7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36,
+    29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63
+};
+
+UINT8 ff_alternate_horizontal_scan[64] = {
+    0,  1,  2,  3,  8,  9, 16, 17, 
+    10, 11,  4,  5,  6,  7, 15, 14,
+    13, 12, 19, 18, 24, 25, 32, 33, 
+    26, 27, 20, 21, 22, 23, 28, 29,
+    30, 31, 34, 35, 40, 41, 48, 49, 
+    42, 43, 36, 37, 38, 39, 44, 45,
+    46, 47, 50, 51, 56, 57, 58, 59, 
+    52, 53, 54, 55, 60, 61, 62, 63,
+};
+
+UINT8 ff_alternate_vertical_scan[64] = {
+    0,  8, 16, 24,  1,  9,  2, 10, 
+    17, 25, 32, 40, 48, 56, 57, 49,
+    41, 33, 26, 18,  3, 11,  4, 12, 
+    19, 27, 34, 42, 50, 58, 35, 43,
+    51, 59, 20, 28,  5, 13,  6, 14, 
+    21, 29, 36, 44, 52, 60, 37, 45,
+    53, 61, 22, 30,  7, 15, 23, 31, 
+    38, 46, 54, 62, 39, 47, 55, 63,
+};
+
 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
 {
     DCTELEM *p;
@@ -350,10 +386,34 @@ int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
-void dsputil_init(void)
+/* permute the block so that it corresponds to the MMX idct
+   order */
+void block_permute(INT16 *block)
 {
+    int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
     int i;
 
+    for(i=0;i<8;i++) {
+        tmp1 = block[1];
+        tmp2 = block[2];
+        tmp3 = block[3];
+        tmp4 = block[4];
+        tmp5 = block[5];
+        tmp6 = block[6];
+        block[1] = tmp2;
+        block[2] = tmp4;
+        block[3] = tmp6;
+        block[4] = tmp1;
+        block[5] = tmp3;
+        block[6] = tmp5;
+        block += 8;
+    }
+}
+
+void dsputil_init(void)
+{
+    int i, j;
+
     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
     for(i=0;i<MAX_NEG_CROP;i++) {
         cropTbl[i] = 0;
@@ -375,7 +435,20 @@ void dsputil_init(void)
     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
     av_fdct = jpeg_fdct_ifast;
 
+    /* permute for IDCT */
+    for(i=0;i<64;i++) {
+        j = zigzag_direct[i];
+        zigzag_direct[i] = block_permute_op(j);
+        j = ff_alternate_horizontal_scan[i];
+        ff_alternate_horizontal_scan[i] = block_permute_op(j);
+        j = ff_alternate_vertical_scan[i];
+        ff_alternate_vertical_scan[i] = block_permute_op(j);
+    }
+    block_permute(default_intra_matrix);
+    block_permute(default_non_intra_matrix);
+
 #ifdef HAVE_MMX
+    printf ("ffmpeg: init mmx\n");
     dsputil_init_mmx();
 #endif
 }
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index ffbc395ba..ebb4d8446 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -15,6 +15,11 @@ void fdct_mmx(DCTELEM *block);
 
 void (*av_fdct)(DCTELEM *block);
 
+/* encoding scans */
+extern UINT8 ff_alternate_horizontal_scan[64];
+extern UINT8 ff_alternate_vertical_scan[64];
+extern UINT8 zigzag_direct[64];
+
 /* pixel operations */
 #define MAX_NEG_CROP 384
 
@@ -62,6 +67,13 @@ int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 
+static inline int block_permute_op(int j)
+{
+    return (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
+}
+
+void block_permute(INT16 *block);
+
 #ifdef HAVE_MMX
 
 #define MM_MMX    0x0001 /* standard MMX */
diff --git a/src/libffmpeg/libavcodec/dsputil_mmx.c b/src/libffmpeg/libavcodec/dsputil_mmx.c
index ddb91f54c..a4d40eb72 100644
--- a/src/libffmpeg/libavcodec/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/dsputil_mmx.c
@@ -30,15 +30,9 @@ int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 
-#ifdef USE_MMX_IDCT
-/* external functions, defined in libmpeg2 */
-void mmx_idct(DCTELEM *block);
-void mmxext_idct(DCTELEM *block);
-/* this should be in dsputil.h?  -- A'rpi   */
-extern UINT8 ff_alternate_horizontal_scan[64];
-extern UINT8 ff_alternate_vertical_scan[64];
-extern UINT8 zigzag_direct[64];
-#endif
+/* external functions, from idct_mmx.c */
+void ff_mmx_idct(DCTELEM *block);
+void ff_mmxext_idct(DCTELEM *block);
 
 /* pixel operations */
 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
@@ -996,8 +990,7 @@ void dsputil_init_mmx(void)
         pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
         pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
         pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
-        //av_fdct = fdct_mmx;
-        av_fdct = 0;
+        av_fdct = fdct_mmx;
 
         put_pixels_tab[0] = put_pixels_mmx;
         put_pixels_tab[1] = put_pixels_x2_mmx;
@@ -1052,22 +1045,11 @@ void dsputil_init_mmx(void)
             sub_pixels_tab[2] = sub_pixels_y2_3dnow;
         }
 
-#ifdef USE_MMX_IDCT
-	/* use MMX / MMXEXT iDCT code from libmpeg2 */
-	//printf("LIBAVCODEC: Using MMX%s iDCT code\n",(mm_flags & MM_MMXEXT)?"EXT":"");
-	ff_idct = (mm_flags & MM_MMXEXT) ? mmxext_idct : mmx_idct;
-	/* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
-	{   int i,j;
-	    for (i = 0; i < 64; i++) {
-		j = zigzag_direct[i];
-		zigzag_direct[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
-		j = ff_alternate_horizontal_scan[i];
-		ff_alternate_horizontal_scan[i] =  (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
-		j = ff_alternate_vertical_scan[i];
-		ff_alternate_vertical_scan[i] =  (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
-	    }
-	}
-#endif
-
+        /* idct */
+        if (mm_flags & MM_MMXEXT) {
+            ff_idct = ff_mmxext_idct;
+        } else {
+            ff_idct = ff_mmx_idct;
+        }
     }
 }
diff --git a/src/libffmpeg/libavcodec/fdctref.c b/src/libffmpeg/libavcodec/fdctref.c
index b90a2e52e..245492496 100644
--- a/src/libffmpeg/libavcodec/fdctref.c
+++ b/src/libffmpeg/libavcodec/fdctref.c
@@ -116,3 +116,39 @@ short *block;
  */
       }
 }
+
+/* perform IDCT matrix multiply for 8x8 coefficient block */
+
+void idct(block)
+short *block;
+{
+  int i, j, k, v;
+  double partial_product;
+  double tmp[64];
+
+  for (i=0; i<8; i++)
+    for (j=0; j<8; j++)
+    {
+      partial_product = 0.0;
+
+      for (k=0; k<8; k++)
+        partial_product+= c[k][j]*block[8*i+k];
+
+      tmp[8*i+j] = partial_product;
+    }
+
+  /* Transpose operation is integrated into address mapping by switching 
+     loop order of i and j */
+
+  for (j=0; j<8; j++)
+    for (i=0; i<8; i++)
+    {
+      partial_product = 0.0;
+
+      for (k=0; k<8; k++)
+        partial_product+= c[k][i]*tmp[8*k+j];
+
+      v = (int) floor(partial_product+0.5);
+      block[8*i+j] = v;
+    }
+}
diff --git a/src/libffmpeg/libavcodec/idct_mmx.c b/src/libffmpeg/libavcodec/idct_mmx.c
new file mode 100644
index 000000000..d004481b1
--- /dev/null
+++ b/src/libffmpeg/libavcodec/idct_mmx.c
@@ -0,0 +1,592 @@
+/*
+ * Note: For libavcodec, this code can also be used under the LGPL license
+ */
+/*
+ * idct_mmx.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <inttypes.h>
+
+#include "config.h"
+
+#include "cpu_accel.h"
+
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 6
+
+#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
+#define rounder(bias) {round (bias), round (bias)}
+
+#if 0
+/* C row IDCT - it's just here to document the MMXEXT and MMX versions */
+static inline void idct_row (int16_t * row, int offset,
+			     int16_t * table, int32_t * rounder)
+{
+    int C1, C2, C3, C4, C5, C6, C7;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    row += offset;
+
+    C1 = table[1];
+    C2 = table[2];
+    C3 = table[3];
+    C4 = table[4];
+    C5 = table[5];
+    C6 = table[6];
+    C7 = table[7];
+
+    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
+    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
+    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
+    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
+
+    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+#endif
+
+
+/* MMXEXT row IDCT */
+
+#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2, -c4, -c2,	\
+						   c4,  c6,  c4,  c6,	\
+						   c1,  c3, -c1, -c5,	\
+						   c5,  c7,  c3, -c7,	\
+						   c4, -c6,  c4, -c6,	\
+						  -c4,  c2,  c4, -c2,	\
+						   c5, -c1,  c3, -c1,	\
+						   c7,  c3,  c7, -c5 }
+
+static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+
+    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+}
+
+static inline void mmxext_row (int16_t * table, int32_t * rounder)
+{
+    movq_m2r (*(table+8), mm1);		// mm1 = -C5 -C1 C3 C1
+    pmaddwd_r2r (mm2, mm4);		// mm4 = C4*x0+C6*x2 C4*x4+C6*x6
+
+    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x4-C6*x6 C4*x0-C6*x2
+    pshufw_r2r (mm6, mm6, 0x4e);	// mm6 = x3 x1 x7 x5
+
+    movq_m2r (*(table+12), mm7);	// mm7 = -C7 C3 C7 C5
+    pmaddwd_r2r (mm5, mm1);		// mm1 = -C1*x5-C5*x7 C1*x1+C3*x3
+
+    paddd_m2r (*rounder, mm3);		// mm3 += rounder
+    pmaddwd_r2r (mm6, mm7);		// mm7 = C3*x1-C7*x3 C5*x5+C7*x7
+
+    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x0-C2*x2 -C4*x4+C2*x6
+    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C3*x5-C1*x7 C5*x1-C1*x3
+    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C7*x1-C5*x3 C7*x5+C3*x7
+    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+
+    paddd_m2r (*rounder, mm0);		// mm0 += rounder
+    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+
+    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
+    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+
+    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
+    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+
+    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
+    movq_r2r (mm0, mm4);		// mm4 = a3 a2 + rounder
+
+    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
+    psubd_r2r (mm5, mm4);		// mm4 = a3-b3 a2-b2 + rounder
+}
+
+static inline void mmxext_row_tail (int16_t * row, int store)
+{
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+
+    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+
+    /* slot */
+
+    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+}
+
+static inline void mmxext_row_mid (int16_t * row, int store,
+				   int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+
+    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
+    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+
+    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+
+    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
+    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+}
+
+
+/* MMX row IDCT */
+
+#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2,  c4,  c6,	\
+					   c4,  c6, -c4, -c2,	\
+					   c1,  c3,  c3, -c7,	\
+					   c5,  c7, -c1, -c5,	\
+					   c4, -c6,  c4, -c2,	\
+					  -c4,  c2,  c4, -c6,	\
+					   c5, -c1,  c7, -c5,	\
+					   c7,  c3,  c3, -c1 }
+
+static inline void mmx_row_head (int16_t * row, int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
+
+    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+
+    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
+    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+}
+
+static inline void mmx_row (int16_t * table, int32_t * rounder)
+{
+    pmaddwd_r2r (mm2, mm4);		// mm4 = -C4*x4-C2*x6 C4*x4+C6*x6
+    punpckldq_r2r (mm5, mm5);		// mm5 = x3 x1 x3 x1
+
+    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x0-C2*x2 C4*x0-C6*x2
+    punpckhdq_r2r (mm6, mm6);		// mm6 = x7 x5 x7 x5
+
+    movq_m2r (*(table+12), mm7);	// mm7 = -C5 -C1 C7 C5
+    pmaddwd_r2r (mm5, mm1);		// mm1 = C3*x1-C7*x3 C1*x1+C3*x3
+
+    paddd_m2r (*rounder, mm3);		// mm3 += rounder
+    pmaddwd_r2r (mm6, mm7);		// mm7 = -C1*x5-C5*x7 C5*x5+C7*x7
+
+    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x4-C6*x6 -C4*x4+C2*x6
+    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C7*x1-C5*x3 C5*x1-C1*x3
+    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C3*x5-C1*x7 C7*x5+C3*x7
+    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+
+    paddd_m2r (*rounder, mm0);		// mm0 += rounder
+    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+
+    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
+    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+
+    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
+    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+
+    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
+    movq_r2r (mm0, mm7);		// mm7 = a3 a2 + rounder
+
+    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
+    psubd_r2r (mm5, mm7);		// mm7 = a3-b3 a2-b2 + rounder
+}
+
+static inline void mmx_row_tail (int16_t * row, int store)
+{
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+
+    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    movq_r2r (mm7, mm4);		// mm4 = y6 y7 y4 y5
+
+    pslld_i2r (16, mm7);		// mm7 = y7 0 y5 0
+
+    psrld_i2r (16, mm4);		// mm4 = 0 y6 0 y4
+
+    por_r2r (mm4, mm7);			// mm7 = y7 y6 y5 y4
+
+    /* slot */
+
+    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
+}
+
+static inline void mmx_row_mid (int16_t * row, int store,
+				int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    movq_r2r (mm7, mm1);		// mm1 = y6 y7 y4 y5
+
+    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
+    psrld_i2r (16, mm7);		// mm7 = 0 y6 0 y4
+
+    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
+    pslld_i2r (16, mm1);		// mm1 = y7 0 y5 0
+
+    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
+    por_r2r (mm1, mm7);			// mm7 = y7 y6 y5 y4
+
+    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
+    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+
+    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+}
+
+
+#if 0
+// C column IDCT - it's just here to document the MMXEXT and MMX versions
+static inline void idct_col (int16_t * col, int offset)
+{
+/* multiplication - as implemented on mmx */
+#define F(c,x) (((c) * (x)) >> 16)
+
+/* saturation - it helps us handle torture test cases */
+#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
+
+    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
+    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
+    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
+    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
+
+    col += offset;
+
+    x0 = col[0*8];
+    x1 = col[1*8];
+    x2 = col[2*8];
+    x3 = col[3*8];
+    x4 = col[4*8];
+    x5 = col[5*8];
+    x6 = col[6*8];
+    x7 = col[7*8];
+
+    u04 = S (x0 + x4);
+    v04 = S (x0 - x4);
+    u26 = S (F (T2, x6) + x2);
+    v26 = S (F (T2, x2) - x6);
+
+    a0 = S (u04 + u26);
+    a1 = S (v04 + v26);
+    a2 = S (v04 - v26);
+    a3 = S (u04 - u26);
+
+    u17 = S (F (T1, x7) + x1);
+    v17 = S (F (T1, x1) - x7);
+    u35 = S (F (T3, x5) + x3);
+    v35 = S (F (T3, x3) - x5);
+
+    b0 = S (u17 + u35);
+    b3 = S (v17 - v35);
+    u12 = S (u17 - u35);
+    v12 = S (v17 + v35);
+    u12 = S (2 * F (C4, u12));
+    v12 = S (2 * F (C4, v12));
+    b1 = S (u12 + v12);
+    b2 = S (u12 - v12);
+
+    y0 = S (a0 + b0) >> COL_SHIFT;
+    y1 = S (a1 + b1) >> COL_SHIFT;
+    y2 = S (a2 + b2) >> COL_SHIFT;
+    y3 = S (a3 + b3) >> COL_SHIFT;
+
+    y4 = S (a3 - b3) >> COL_SHIFT;
+    y5 = S (a2 - b2) >> COL_SHIFT;
+    y6 = S (a1 - b1) >> COL_SHIFT;
+    y7 = S (a0 - b0) >> COL_SHIFT;
+
+    col[0*8] = y0;
+    col[1*8] = y1;
+    col[2*8] = y2;
+    col[3*8] = y3;
+    col[4*8] = y4;
+    col[5*8] = y5;
+    col[6*8] = y6;
+    col[7*8] = y7;
+}
+#endif
+
+
+// MMX column IDCT
+static inline void idct_col (int16_t * col, int offset)
+{
+#define T1 13036
+#define T2 27146
+#define T3 43790
+#define C4 23170
+
+    static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
+    static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
+    static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
+    static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
+
+    /* column code adapted from peter gubanov */
+    /* http://www.elecard.com/peter/idct.shtml */
+
+    movq_m2r (*_T1, mm0);		// mm0 = T1
+
+    movq_m2r (*(col+offset+1*8), mm1);	// mm1 = x1
+    movq_r2r (mm0, mm2);		// mm2 = T1
+
+    movq_m2r (*(col+offset+7*8), mm4);	// mm4 = x7
+    pmulhw_r2r (mm1, mm0);		// mm0 = T1*x1
+
+    movq_m2r (*_T3, mm5);		// mm5 = T3
+    pmulhw_r2r (mm4, mm2);		// mm2 = T1*x7
+
+    movq_m2r (*(col+offset+5*8), mm6);	// mm6 = x5
+    movq_r2r (mm5, mm7);		// mm7 = T3-1
+
+    movq_m2r (*(col+offset+3*8), mm3);	// mm3 = x3
+    psubsw_r2r (mm4, mm0);		// mm0 = v17
+
+    movq_m2r (*_T2, mm4);		// mm4 = T2
+    pmulhw_r2r (mm3, mm5);		// mm5 = (T3-1)*x3
+
+    paddsw_r2r (mm2, mm1);		// mm1 = u17
+    pmulhw_r2r (mm6, mm7);		// mm7 = (T3-1)*x5
+
+    /* slot */
+
+    movq_r2r (mm4, mm2);		// mm2 = T2
+    paddsw_r2r (mm3, mm5);		// mm5 = T3*x3
+
+    pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2
+    paddsw_r2r (mm6, mm7);		// mm7 = T3*x5
+
+    psubsw_r2r (mm6, mm5);		// mm5 = v35
+    paddsw_r2r (mm3, mm7);		// mm7 = u35
+
+    movq_m2r (*(col+offset+6*8), mm3);	// mm3 = x6
+    movq_r2r (mm0, mm6);		// mm6 = v17
+
+    pmulhw_r2r (mm3, mm2);		// mm2 = T2*x6
+    psubsw_r2r (mm5, mm0);		// mm0 = b3
+
+    psubsw_r2r (mm3, mm4);		// mm4 = v26
+    paddsw_r2r (mm6, mm5);		// mm5 = v12
+
+    movq_r2m (mm0, *(col+offset+3*8));	// save b3 in scratch0
+    movq_r2r (mm1, mm6);		// mm6 = u17
+
+    paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26
+    paddsw_r2r (mm7, mm6);		// mm6 = b0
+
+    psubsw_r2r (mm7, mm1);		// mm1 = u12
+    movq_r2r (mm1, mm7);		// mm7 = u12
+
+    movq_m2r (*(col+offset+0*8), mm3);	// mm3 = x0
+    paddsw_r2r (mm5, mm1);		// mm1 = u12+v12
+
+    movq_m2r (*_C4, mm0);		// mm0 = C4/2
+    psubsw_r2r (mm5, mm7);		// mm7 = u12-v12
+
+    movq_r2m (mm6, *(col+offset+5*8));	// save b0 in scratch1
+    pmulhw_r2r (mm0, mm1);		// mm1 = b1/2
+
+    movq_r2r (mm4, mm6);		// mm6 = v26
+    pmulhw_r2r (mm0, mm7);		// mm7 = b2/2
+
+    movq_m2r (*(col+offset+4*8), mm5);	// mm5 = x4
+    movq_r2r (mm3, mm0);		// mm0 = x0
+
+    psubsw_r2r (mm5, mm3);		// mm3 = v04
+    paddsw_r2r (mm5, mm0);		// mm0 = u04
+
+    paddsw_r2r (mm3, mm4);		// mm4 = a1
+    movq_r2r (mm0, mm5);		// mm5 = u04
+
+    psubsw_r2r (mm6, mm3);		// mm3 = a2
+    paddsw_r2r (mm2, mm5);		// mm5 = a0
+
+    paddsw_r2r (mm1, mm1);		// mm1 = b1
+    psubsw_r2r (mm2, mm0);		// mm0 = a3
+
+    paddsw_r2r (mm7, mm7);		// mm7 = b2
+    movq_r2r (mm3, mm2);		// mm2 = a2
+
+    movq_r2r (mm4, mm6);		// mm6 = a1
+    paddsw_r2r (mm7, mm3);		// mm3 = a2+b2
+
+    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y2
+    paddsw_r2r (mm1, mm4);		// mm4 = a1+b1
+
+    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y1
+    psubsw_r2r (mm1, mm6);		// mm6 = a1-b1
+
+    movq_m2r (*(col+offset+5*8), mm1);	// mm1 = b0
+    psubsw_r2r (mm7, mm2);		// mm2 = a2-b2
+
+    psraw_i2r (COL_SHIFT, mm6);		// mm6 = y6
+    movq_r2r (mm5, mm7);		// mm7 = a0
+
+    movq_r2m (mm4, *(col+offset+1*8));	// save y1
+    psraw_i2r (COL_SHIFT, mm2);		// mm2 = y5
+
+    movq_r2m (mm3, *(col+offset+2*8));	// save y2
+    paddsw_r2r (mm1, mm5);		// mm5 = a0+b0
+
+    movq_m2r (*(col+offset+3*8), mm4);	// mm4 = b3
+    psubsw_r2r (mm1, mm7);		// mm7 = a0-b0
+
+    psraw_i2r (COL_SHIFT, mm5);		// mm5 = y0
+    movq_r2r (mm0, mm3);		// mm3 = a3
+
+    movq_r2m (mm2, *(col+offset+5*8));	// save y5
+    psubsw_r2r (mm4, mm3);		// mm3 = a3-b3
+
+    psraw_i2r (COL_SHIFT, mm7);		// mm7 = y7
+    paddsw_r2r (mm0, mm4);		// mm4 = a3+b3
+
+    movq_r2m (mm5, *(col+offset+0*8));	// save y0
+    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y4
+
+    movq_r2m (mm6, *(col+offset+6*8));	// save y6
+    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y3
+
+    movq_r2m (mm7, *(col+offset+7*8));	// save y7
+
+    movq_r2m (mm3, *(col+offset+4*8));	// save y4
+
+    movq_r2m (mm4, *(col+offset+3*8));	// save y3
+}
+
+
+static int32_t rounder0[] ATTR_ALIGN(8) =
+    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
+static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
+static int32_t rounder1[] ATTR_ALIGN(8) =
+    rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
+static int32_t rounder7[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
+static int32_t rounder2[] ATTR_ALIGN(8) =
+    rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
+static int32_t rounder6[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C2 * (C6-C2)/2 */
+static int32_t rounder3[] ATTR_ALIGN(8) =
+    rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
+static int32_t rounder5[] ATTR_ALIGN(8) =
+    rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
+
+
+#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
+void idct (int16_t * block)					\
+{									\
+    static int16_t table04[] ATTR_ALIGN(16) =				\
+	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
+    static int16_t table17[] ATTR_ALIGN(16) =				\
+	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
+    static int16_t table26[] ATTR_ALIGN(16) =				\
+	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
+    static int16_t table35[] ATTR_ALIGN(16) =				\
+	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
+									\
+    idct_row_head (block, 0*8, table04);				\
+    idct_row (table04, rounder0);					\
+    idct_row_mid (block, 0*8, 4*8, table04);				\
+    idct_row (table04, rounder4);					\
+    idct_row_mid (block, 4*8, 1*8, table17);				\
+    idct_row (table17, rounder1);					\
+    idct_row_mid (block, 1*8, 7*8, table17);				\
+    idct_row (table17, rounder7);					\
+    idct_row_mid (block, 7*8, 2*8, table26);				\
+    idct_row (table26, rounder2);					\
+    idct_row_mid (block, 2*8, 6*8, table26);				\
+    idct_row (table26, rounder6);					\
+    idct_row_mid (block, 6*8, 3*8, table35);				\
+    idct_row (table35, rounder3);					\
+    idct_row_mid (block, 3*8, 5*8, table35);				\
+    idct_row (table35, rounder5);					\
+    idct_row_tail (block, 5*8);						\
+									\
+    idct_col (block, 0);						\
+    idct_col (block, 4);						\
+}
+
+
+declare_idct (ff_mmxext_idct, mmxext_table,
+	      mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
+
+declare_idct (ff_mmx_idct, mmx_table,
+	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
diff --git a/src/libffmpeg/libavcodec/jrevdct.c b/src/libffmpeg/libavcodec/jrevdct.c
index 2ef40f38e..246f1b190 100644
--- a/src/libffmpeg/libavcodec/jrevdct.c
+++ b/src/libffmpeg/libavcodec/jrevdct.c
@@ -197,16 +197,18 @@ void j_rev_dct(DCTBLOCK data)
 
     register int *idataptr = (int*)dataptr;
 
+    /* WARNING: we do the same permutation as MMX idct to simplify the
+       video core */
     d0 = dataptr[0];
-    d1 = dataptr[1];
-    d2 = dataptr[2];
-    d3 = dataptr[3];
-    d4 = dataptr[4];
-    d5 = dataptr[5];
-    d6 = dataptr[6];
+    d2 = dataptr[1];
+    d4 = dataptr[2];
+    d6 = dataptr[3];
+    d1 = dataptr[4];
+    d3 = dataptr[5];
+    d5 = dataptr[6];
     d7 = dataptr[7];
 
-    if ((d1 == 0) && (idataptr[1] | idataptr[2] | idataptr[3]) == 0) {
+    if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
       /* AC terms all zero */
       if (d0) {
 	  /* Compute a 32 bit value to assign. */
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index e119df66f..df7415b81 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -220,7 +220,7 @@ static int put_huffman_table(MpegEncContext *s, int table_class, int table_id,
 static void jpeg_table_header(MpegEncContext *s)
 {
     PutBitContext *p = &s->pb;
-    int i, size;
+    int i, j, size;
     UINT8 *ptr;
 
     /* quant matrixes */
@@ -229,13 +229,15 @@ static void jpeg_table_header(MpegEncContext *s)
     put_bits(p, 4, 0); /* 8 bit precision */
     put_bits(p, 4, 0); /* table 0 */
     for(i=0;i<64;i++) {
-        put_bits(p, 8, s->intra_matrix[i]);
+        j = zigzag_direct[i];
+        put_bits(p, 8, s->intra_matrix[j]);
     }
 #if 0
     put_bits(p, 4, 0); /* 8 bit precision */
     put_bits(p, 4, 1); /* table 1 */
     for(i=0;i<64;i++) {
-        put_bits(p, 8, s->chroma_intra_matrix[i]);
+        j = zigzag_direct[i];
+        put_bits(p, 8, s->chroma_intra_matrix[j]);
     }
 #endif
 
@@ -489,7 +491,7 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
 static int mjpeg_decode_dqt(MJpegDecodeContext *s,
                             UINT8 *buf, int buf_size)
 {
-    int len, index, i;
+    int len, index, i, j;
     init_get_bits(&s->gb, buf, buf_size);
 
     len = get_bits(&s->gb, 16);
@@ -504,8 +506,10 @@ static int mjpeg_decode_dqt(MJpegDecodeContext *s,
             return -1;
         dprintf("index=%d\n", index);
         /* read quant table */
-        for(i=0;i<64;i++)
-            s->quant_matrixes[index][i] = get_bits(&s->gb, 8);
+        for(i=0;i<64;i++) {
+            j = zigzag_direct[i];
+            s->quant_matrixes[index][j] = get_bits(&s->gb, 8);
+        }
         len -= 65;
     }
     return 0;
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index 182c341e4..381fafadd 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -1172,32 +1172,36 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
 
 static void mpeg_decode_quant_matrix_extension(MpegEncContext *s)
 {
-    int i, v;
+    int i, v, j;
 
     if (get_bits1(&s->gb)) {
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
-            s->intra_matrix[i] = v;
-            s->chroma_intra_matrix[i] = v;
+            j = block_permute_op(i);
+            s->intra_matrix[j] = v;
+            s->chroma_intra_matrix[j] = v;
         }
     }
     if (get_bits1(&s->gb)) {
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
-            s->non_intra_matrix[i] = v;
-            s->chroma_non_intra_matrix[i] = v;
+            j = block_permute_op(i);
+            s->non_intra_matrix[j] = v;
+            s->chroma_non_intra_matrix[j] = v;
         }
     }
     if (get_bits1(&s->gb)) {
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
-            s->chroma_intra_matrix[i] = v;
+            j = block_permute_op(i);
+            s->chroma_intra_matrix[j] = v;
         }
     }
     if (get_bits1(&s->gb)) {
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
-            s->chroma_non_intra_matrix[i] = v;
+            j = block_permute_op(i);
+            s->chroma_non_intra_matrix[j] = v;
         }
     }
 }
@@ -1345,7 +1349,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
 {
     Mpeg1Context *s1 = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
-    int width, height, i, v;
+    int width, height, i, v, j;
     
     init_get_bits(&s->gb, buf, buf_size);
 
@@ -1389,8 +1393,9 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     if (get_bits1(&s->gb)) {
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
-            s->intra_matrix[i] = v;
-            s->chroma_intra_matrix[i] = v;
+            j = block_permute_op(i);
+            s->intra_matrix[j] = v;
+            s->chroma_intra_matrix[j] = v;
         }
     } else {
         for(i=0;i<64;i++) {
@@ -1402,8 +1407,9 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     if (get_bits1(&s->gb)) {
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
-            s->non_intra_matrix[i] = v;
-            s->chroma_non_intra_matrix[i] = v;
+            j = block_permute_op(i);
+            s->non_intra_matrix[j] = v;
+            s->chroma_non_intra_matrix[j] = v;
         }
     } else {
         for(i=0;i<64;i++) {
diff --git a/src/libffmpeg/libavcodec/mpeg12data.h b/src/libffmpeg/libavcodec/mpeg12data.h
index f397c4a17..4f6a95b79 100644
--- a/src/libffmpeg/libavcodec/mpeg12data.h
+++ b/src/libffmpeg/libavcodec/mpeg12data.h
@@ -2,7 +2,7 @@
  * MPEG1/2 tables
  */
 
-const UINT8 default_intra_matrix[64] = {
+INT16 default_intra_matrix[64] = {
 	8, 16, 19, 22, 26, 27, 29, 34,
 	16, 16, 22, 24, 27, 29, 34, 37,
 	19, 22, 26, 27, 29, 34, 34, 38,
@@ -13,7 +13,7 @@ const UINT8 default_intra_matrix[64] = {
 	27, 29, 35, 38, 46, 56, 69, 83
 };
 
-const UINT8 default_non_intra_matrix[64] = {
+INT16 default_non_intra_matrix[64] = {
     16, 16, 16, 16, 16, 16, 16, 16,
     16, 16, 16, 16, 16, 16, 16, 16,
     16, 16, 16, 16, 16, 16, 16, 16,
@@ -331,18 +331,6 @@ static const UINT8 mbMotionVectorTable[17][2] = {
 { 0xc, 10 },
 };
 
-//const 
-UINT8 zigzag_direct[64] = {
-    0, 1, 8, 16, 9, 2, 3, 10,
-    17, 24, 32, 25, 18, 11, 4, 5,
-    12, 19, 26, 33, 40, 48, 41, 34,
-    27, 20, 13, 6, 7, 14, 21, 28,
-    35, 42, 49, 56, 57, 50, 43, 36,
-    29, 22, 15, 23, 30, 37, 44, 51,
-    58, 59, 52, 45, 38, 31, 39, 46,
-    53, 60, 61, 54, 47, 55, 62, 63
-};
-
 static const int frame_rate_tab[9] = {
     0, 
     (int)(23.976 * FRAME_RATE_BASE), 
diff --git a/src/libffmpeg/libavcodec/mpeg4data.h b/src/libffmpeg/libavcodec/mpeg4data.h
index 54b93d97e..3821a591c 100644
--- a/src/libffmpeg/libavcodec/mpeg4data.h
+++ b/src/libffmpeg/libavcodec/mpeg4data.h
@@ -81,26 +81,3 @@ static RLTable rl_intra = {
     intra_run,
     intra_level,
 };
-
-/* alternate scan orders used when doing AC prediction */
-UINT8 ff_alternate_horizontal_scan[64] = {
-    0,  1,  2,  3,  8,  9, 16, 17, 
-    10, 11,  4,  5,  6,  7, 15, 14,
-    13, 12, 19, 18, 24, 25, 32, 33, 
-    26, 27, 20, 21, 22, 23, 28, 29,
-    30, 31, 34, 35, 40, 41, 48, 49, 
-    42, 43, 36, 37, 38, 39, 44, 45,
-    46, 47, 50, 51, 56, 57, 58, 59, 
-    52, 53, 54, 55, 60, 61, 62, 63,
-};
-
-UINT8 ff_alternate_vertical_scan[64] = {
-    0,  8, 16, 24,  1,  9,  2, 10, 
-    17, 25, 32, 40, 48, 56, 57, 49,
-    41, 33, 26, 18,  3, 11,  4, 12, 
-    19, 27, 34, 42, 50, 58, 35, 43,
-    51, 59, 20, 28,  5, 13,  6, 14, 
-    21, 29, 36, 44, 52, 60, 37, 45,
-    53, 61, 22, 30,  7, 15, 23, 31, 
-    38, 46, 54, 62, 39, 47, 55, 63,
-};
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index 8a5745a8f..a2b1cb61e 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -28,11 +28,6 @@
 #include "fastmemcpy.h"
 #endif
 
-/* FIXME */
-#ifdef ARCH_X86
-#define HAVE_MMX
-#endif
-
 static void encode_picture(MpegEncContext *s, int picture_number);
 static void rate_control_init(MpegEncContext *s);
 static int rate_estimate_qscale(MpegEncContext *s);
@@ -1073,6 +1068,10 @@ static int dct_quantize_mmx(MpegEncContext *s,
     const int *qmat;
 
     av_fdct (block);
+    
+    /* This permutation is needed to compensate for the IDCT
+       permutation. It will be moved into the DCT code. */
+    block_permute(block);
 
     if (s->mb_intra) {
         if (n < 4)
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 9f9307393..a225dedbe 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -179,9 +179,6 @@ typedef struct MpegEncContext {
                            DCTELEM *block, int n, int qscale);
 } MpegEncContext;
 
-//const 
-extern UINT8 zigzag_direct[64];
-
 int MPV_common_init(MpegEncContext *s);
 void MPV_common_end(MpegEncContext *s);
 void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
@@ -198,8 +195,8 @@ int estimate_motion(MpegEncContext *s,
                     int *mx_ptr, int *my_ptr);
 
 /* mpeg12.c */
-extern const UINT8 default_intra_matrix[64];
-extern const UINT8 default_non_intra_matrix[64];
+extern INT16 default_intra_matrix[64];
+extern INT16 default_non_intra_matrix[64];
 
 void mpeg1_encode_picture_header(MpegEncContext *s, int picture_number);
 void mpeg1_encode_mb(MpegEncContext *s,
@@ -257,8 +254,6 @@ int intel_h263_decode_picture_header(MpegEncContext *s);
 int h263_decode_mb(MpegEncContext *s,
                    DCTELEM block[6][64]);
 int h263_get_picture_format(int width, int height);
-extern UINT8 ff_alternate_horizontal_scan[64];
-extern UINT8 ff_alternate_vertical_scan[64];
 
 /* rv10.c */
 void rv10_encode_picture_header(MpegEncContext *s, int picture_number);
diff --git a/src/video_out/yuv2rgb.c b/src/video_out/yuv2rgb.c
index c4ad9d43d..d92b62839 100644
--- a/src/video_out/yuv2rgb.c
+++ b/src/video_out/yuv2rgb.c
@@ -22,7 +22,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
- * $Id: yuv2rgb.c,v 1.10 2001/07/30 19:37:18 guenter Exp $
+ * $Id: yuv2rgb.c,v 1.11 2001/08/07 23:59:50 guenter Exp $
  */
 
 #include "config.h"
@@ -95,9 +95,24 @@ int yuv2rgb_setup (yuv2rgb_t *this,
     this->v_buffer = this->v_chunk = NULL;
   }
 
-  if ((source_width == dest_width) && (source_height == dest_height)) 
+  if ((source_width == dest_width) && (source_height == dest_height)) {
     this->do_scale = 0;
-  else {
+
+    /*
+     * space for two y-lines (for yuv2rgb_mlib)
+     * u,v subsampled 2:1
+     */
+    this->y_buffer = my_malloc_aligned (16, 2*dest_width, &this->y_chunk);
+    if (!this->y_buffer)
+      return 0;
+    this->u_buffer = my_malloc_aligned (16, (dest_width+1)/2, &this->u_chunk);
+    if (!this->u_buffer)
+      return 0;
+    this->v_buffer = my_malloc_aligned (16, (dest_width+1)/2, &this->v_chunk);
+    if (!this->v_buffer)
+      return 0;
+
+  } else {
     this->do_scale = 1;
     
     this->step_dx = source_width  * 32768 / dest_width;
@@ -1227,8 +1242,8 @@ yuv2rgb_t *yuv2rgb_init (int mode) {
   this->matrix_coefficients = 6;
 
   this->y_chunk = this->y_buffer = NULL;
-  this->y_chunk = this->u_buffer = NULL;
-  this->y_chunk = this->v_buffer = NULL;
+  this->u_chunk = this->u_buffer = NULL;
+  this->v_chunk = this->v_buffer = NULL;
 
   yuv2rgb_setup_tables(this, mode);
 
diff --git a/src/xine-engine/cpu_accel.h b/src/xine-engine/cpu_accel.h
index ca2713ff4..498b219fb 100644
--- a/src/xine-engine/cpu_accel.h
+++ b/src/xine-engine/cpu_accel.h
@@ -51,7 +51,7 @@ extern "C" {
 #define MM_SSE2                 0x00000000
 
 uint32_t mm_accel (void) ;
-uint32_t mm_support (void) ;
+/* uint32_t mm_support (void) ; */
 
 #ifdef ARCH_X86
 
-- 
cgit v1.2.3