Index: libavcodec/common.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/common.h,v
retrieving revision 1.56
diff -u -r1.56 common.h
--- libavcodec/common.h	22 Aug 2002 19:41:39 -0000	1.56
+++ libavcodec/common.h	16 Sep 2002 17:55:53 -0000
@@ -147,11 +147,19 @@
 
 #else
 
+#if __GNUC__
 #ifdef DEBUG
 #define dprintf(fmt,args...) printf(fmt, ## args)
 #else
 #define dprintf(fmt,args...)
 #endif
+#else
+#ifdef DEBUG
+#define dprintf(...) printf(__VA_ARGS__)
+#else
+#define dprintf(...)
+#endif
+#endif
 
 #endif /* !CONFIG_WIN32 */
 
@@ -922,6 +930,8 @@
     }
     return ret;
 }
+#define RUNTIME_CPUDETECT
+
 #if __CPU__ >= 686 && !defined(RUNTIME_CPUDETECT)
 #define COPY3_IF_LT(x,y,a,b,c,d)\
 asm volatile (\
Index: libavcodec/dsputil.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.h,v
retrieving revision 1.28
diff -u -r1.28 dsputil.h
--- libavcodec/dsputil.h	14 Sep 2002 19:00:09 -0000	1.28
+++ libavcodec/dsputil.h	16 Sep 2002 17:55:53 -0000
@@ -21,8 +21,9 @@
 
 #include "common.h"
 #include "avcodec.h"
+#include "xineutils.h"
 
-//#define DEBUG
+#undef DEBUG
 /* dct code */
 typedef short DCTELEM;
 
@@ -125,23 +126,32 @@
 }
 
 void block_permute(INT16 *block);
+          
+#if defined(ARCH_X86)
+#define HAVE_MMX 1 
+#endif
 
 #if defined(HAVE_MMX)
 
+#if 0
 #define MM_MMX    0x0001 /* standard MMX */
 #define MM_3DNOW  0x0004 /* AMD 3DNOW */
 #define MM_MMXEXT 0x0002 /* SSE integer functions or AMD MMX ext */
 #define MM_SSE    0x0008 /* SSE functions */
 #define MM_SSE2   0x0010 /* PIV SSE2 functions */
+#endif
 
 extern int mm_flags;
 
-int mm_support(void);
+/*int mm_support(void);*/
+#define mm_support() xine_mm_accel()
 
+#if 0 
 static inline void emms(void)
 {
     __asm __volatile ("emms;":::"memory");
 }
+#endif
 
 #define emms_c() \
 {\
Index: libavcodec/i386/fdct_mmx.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/fdct_mmx.c,v
retrieving revision 1.3
diff -u -r1.3 fdct_mmx.c
--- libavcodec/i386/fdct_mmx.c	1 Sep 2002 16:52:33 -0000	1.3
+++ libavcodec/i386/fdct_mmx.c	16 Sep 2002 17:55:53 -0000
@@ -45,8 +45,8 @@
     23170, 23170, 23170, 23170,	//cos * (2<<15) + 0.5
 };
 
-const long long  fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
-const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
+static const mmx_t fdct_one_corr = {0x0001000100010001LL};
+static volatile mmx_t fdct_r_row = { d:{RND_FRW_ROW, RND_FRW_ROW} };
 
 const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
     //row0
@@ -242,18 +242,18 @@
     punpckhdq_r2r(mm6, mm6);
     movq_m2r(*(table + 20), mm7);
     pmaddwd_r2r(mm5, mm1);
-    paddd_m2r(*fdct_r_row, mm3);
+    paddd_m2r(fdct_r_row, mm3);
     pmaddwd_r2r(mm6, mm7);
     pmaddwd_m2r(*(table + 12), mm2);
     paddd_r2r(mm4, mm3);
     pmaddwd_m2r(*(table + 24), mm5);
     pmaddwd_m2r(*(table + 28), mm6);
     paddd_r2r(mm7, mm1);
-    paddd_m2r(*fdct_r_row, mm0);
+    paddd_m2r(fdct_r_row, mm0);
     psrad_i2r(SHIFT_FRW_ROW, mm3);
-    paddd_m2r(*fdct_r_row, mm1);
+    paddd_m2r(fdct_r_row, mm1);
     paddd_r2r(mm2, mm0);
-    paddd_m2r(*fdct_r_row, mm5);
+    paddd_m2r(fdct_r_row, mm5);
     psrad_i2r(SHIFT_FRW_ROW, mm1);
     paddd_r2r(mm6, mm5);
     psrad_i2r(SHIFT_FRW_ROW, mm0);
Index: libavcodec/i386/mmx.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/mmx.h,v
retrieving revision 1.3
diff -u -r1.3 mmx.h
--- libavcodec/i386/mmx.h	27 May 2002 08:31:54 -0000	1.3
+++ libavcodec/i386/mmx.h	16 Sep 2002 17:55:53 -0000
@@ -1,243 +1 @@
-/*
- * mmx.h
- * Copyright (C) 1997-2001 H. Dietz and R. Fisher
- */
-#ifndef AVCODEC_I386MMX_H
-#define AVCODEC_I386MMX_H
-
-/*
- * The type of an value that fits in an MMX register (note that long
- * long constant values MUST be suffixed by LL and unsigned long long
- * values by ULL, lest they be truncated by the compiler)
- */
-
-typedef	union {
-	long long		q;	/* Quadword (64-bit) value */
-	unsigned long long	uq;	/* Unsigned Quadword */
-	int			d[2];	/* 2 Doubleword (32-bit) values */
-	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
-	short			w[4];	/* 4 Word (16-bit) values */
-	unsigned short		uw[4];	/* 4 Unsigned Word */
-	char			b[8];	/* 8 Byte (8-bit) values */
-	unsigned char		ub[8];	/* 8 Unsigned Byte */
-	float			s[2];	/* Single-precision (32-bit) value */
-} mmx_t;	/* On an 8-byte (64-bit) boundary */
-
-
-#define	mmx_i2r(op,imm,reg) \
-	__asm__ __volatile__ (#op " %0, %%" #reg \
-			      : /* nothing */ \
-			      : "i" (imm) )
-
-#define	mmx_m2r(op,mem,reg) \
-	__asm__ __volatile__ (#op " %0, %%" #reg \
-			      : /* nothing */ \
-			      : "m" (mem))
-
-#define	mmx_r2m(op,reg,mem) \
-	__asm__ __volatile__ (#op " %%" #reg ", %0" \
-			      : "=m" (mem) \
-			      : /* nothing */ )
-
-#define	mmx_r2r(op,regs,regd) \
-	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
-
-
-#define	emms() __asm__ __volatile__ ("emms")
-
-#define	movd_m2r(var,reg)	mmx_m2r (movd, var, reg)
-#define	movd_r2m(reg,var)	mmx_r2m (movd, reg, var)
-#define	movd_r2r(regs,regd)	mmx_r2r (movd, regs, regd)
-
-#define	movq_m2r(var,reg)	mmx_m2r (movq, var, reg)
-#define	movq_r2m(reg,var)	mmx_r2m (movq, reg, var)
-#define	movq_r2r(regs,regd)	mmx_r2r (movq, regs, regd)
-
-#define	packssdw_m2r(var,reg)	mmx_m2r (packssdw, var, reg)
-#define	packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
-#define	packsswb_m2r(var,reg)	mmx_m2r (packsswb, var, reg)
-#define	packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
-
-#define	packuswb_m2r(var,reg)	mmx_m2r (packuswb, var, reg)
-#define	packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
-
-#define	paddb_m2r(var,reg)	mmx_m2r (paddb, var, reg)
-#define	paddb_r2r(regs,regd)	mmx_r2r (paddb, regs, regd)
-#define	paddd_m2r(var,reg)	mmx_m2r (paddd, var, reg)
-#define	paddd_r2r(regs,regd)	mmx_r2r (paddd, regs, regd)
-#define	paddw_m2r(var,reg)	mmx_m2r (paddw, var, reg)
-#define	paddw_r2r(regs,regd)	mmx_r2r (paddw, regs, regd)
-
-#define	paddsb_m2r(var,reg)	mmx_m2r (paddsb, var, reg)
-#define	paddsb_r2r(regs,regd)	mmx_r2r (paddsb, regs, regd)
-#define	paddsw_m2r(var,reg)	mmx_m2r (paddsw, var, reg)
-#define	paddsw_r2r(regs,regd)	mmx_r2r (paddsw, regs, regd)
-
-#define	paddusb_m2r(var,reg)	mmx_m2r (paddusb, var, reg)
-#define	paddusb_r2r(regs,regd)	mmx_r2r (paddusb, regs, regd)
-#define	paddusw_m2r(var,reg)	mmx_m2r (paddusw, var, reg)
-#define	paddusw_r2r(regs,regd)	mmx_r2r (paddusw, regs, regd)
-
-#define	pand_m2r(var,reg)	mmx_m2r (pand, var, reg)
-#define	pand_r2r(regs,regd)	mmx_r2r (pand, regs, regd)
-
-#define	pandn_m2r(var,reg)	mmx_m2r (pandn, var, reg)
-#define	pandn_r2r(regs,regd)	mmx_r2r (pandn, regs, regd)
-
-#define	pcmpeqb_m2r(var,reg)	mmx_m2r (pcmpeqb, var, reg)
-#define	pcmpeqb_r2r(regs,regd)	mmx_r2r (pcmpeqb, regs, regd)
-#define	pcmpeqd_m2r(var,reg)	mmx_m2r (pcmpeqd, var, reg)
-#define	pcmpeqd_r2r(regs,regd)	mmx_r2r (pcmpeqd, regs, regd)
-#define	pcmpeqw_m2r(var,reg)	mmx_m2r (pcmpeqw, var, reg)
-#define	pcmpeqw_r2r(regs,regd)	mmx_r2r (pcmpeqw, regs, regd)
-
-#define	pcmpgtb_m2r(var,reg)	mmx_m2r (pcmpgtb, var, reg)
-#define	pcmpgtb_r2r(regs,regd)	mmx_r2r (pcmpgtb, regs, regd)
-#define	pcmpgtd_m2r(var,reg)	mmx_m2r (pcmpgtd, var, reg)
-#define	pcmpgtd_r2r(regs,regd)	mmx_r2r (pcmpgtd, regs, regd)
-#define	pcmpgtw_m2r(var,reg)	mmx_m2r (pcmpgtw, var, reg)
-#define	pcmpgtw_r2r(regs,regd)	mmx_r2r (pcmpgtw, regs, regd)
-
-#define	pmaddwd_m2r(var,reg)	mmx_m2r (pmaddwd, var, reg)
-#define	pmaddwd_r2r(regs,regd)	mmx_r2r (pmaddwd, regs, regd)
-
-#define	pmulhw_m2r(var,reg)	mmx_m2r (pmulhw, var, reg)
-#define	pmulhw_r2r(regs,regd)	mmx_r2r (pmulhw, regs, regd)
-
-#define	pmullw_m2r(var,reg)	mmx_m2r (pmullw, var, reg)
-#define	pmullw_r2r(regs,regd)	mmx_r2r (pmullw, regs, regd)
-
-#define	por_m2r(var,reg)	mmx_m2r (por, var, reg)
-#define	por_r2r(regs,regd)	mmx_r2r (por, regs, regd)
-
-#define	pslld_i2r(imm,reg)	mmx_i2r (pslld, imm, reg)
-#define	pslld_m2r(var,reg)	mmx_m2r (pslld, var, reg)
-#define	pslld_r2r(regs,regd)	mmx_r2r (pslld, regs, regd)
-#define	psllq_i2r(imm,reg)	mmx_i2r (psllq, imm, reg)
-#define	psllq_m2r(var,reg)	mmx_m2r (psllq, var, reg)
-#define	psllq_r2r(regs,regd)	mmx_r2r (psllq, regs, regd)
-#define	psllw_i2r(imm,reg)	mmx_i2r (psllw, imm, reg)
-#define	psllw_m2r(var,reg)	mmx_m2r (psllw, var, reg)
-#define	psllw_r2r(regs,regd)	mmx_r2r (psllw, regs, regd)
-
-#define	psrad_i2r(imm,reg)	mmx_i2r (psrad, imm, reg)
-#define	psrad_m2r(var,reg)	mmx_m2r (psrad, var, reg)
-#define	psrad_r2r(regs,regd)	mmx_r2r (psrad, regs, regd)
-#define	psraw_i2r(imm,reg)	mmx_i2r (psraw, imm, reg)
-#define	psraw_m2r(var,reg)	mmx_m2r (psraw, var, reg)
-#define	psraw_r2r(regs,regd)	mmx_r2r (psraw, regs, regd)
-
-#define	psrld_i2r(imm,reg)	mmx_i2r (psrld, imm, reg)
-#define	psrld_m2r(var,reg)	mmx_m2r (psrld, var, reg)
-#define	psrld_r2r(regs,regd)	mmx_r2r (psrld, regs, regd)
-#define	psrlq_i2r(imm,reg)	mmx_i2r (psrlq, imm, reg)
-#define	psrlq_m2r(var,reg)	mmx_m2r (psrlq, var, reg)
-#define	psrlq_r2r(regs,regd)	mmx_r2r (psrlq, regs, regd)
-#define	psrlw_i2r(imm,reg)	mmx_i2r (psrlw, imm, reg)
-#define	psrlw_m2r(var,reg)	mmx_m2r (psrlw, var, reg)
-#define	psrlw_r2r(regs,regd)	mmx_r2r (psrlw, regs, regd)
-
-#define	psubb_m2r(var,reg)	mmx_m2r (psubb, var, reg)
-#define	psubb_r2r(regs,regd)	mmx_r2r (psubb, regs, regd)
-#define	psubd_m2r(var,reg)	mmx_m2r (psubd, var, reg)
-#define	psubd_r2r(regs,regd)	mmx_r2r (psubd, regs, regd)
-#define	psubw_m2r(var,reg)	mmx_m2r (psubw, var, reg)
-#define	psubw_r2r(regs,regd)	mmx_r2r (psubw, regs, regd)
-
-#define	psubsb_m2r(var,reg)	mmx_m2r (psubsb, var, reg)
-#define	psubsb_r2r(regs,regd)	mmx_r2r (psubsb, regs, regd)
-#define	psubsw_m2r(var,reg)	mmx_m2r (psubsw, var, reg)
-#define	psubsw_r2r(regs,regd)	mmx_r2r (psubsw, regs, regd)
-
-#define	psubusb_m2r(var,reg)	mmx_m2r (psubusb, var, reg)
-#define	psubusb_r2r(regs,regd)	mmx_r2r (psubusb, regs, regd)
-#define	psubusw_m2r(var,reg)	mmx_m2r (psubusw, var, reg)
-#define	psubusw_r2r(regs,regd)	mmx_r2r (psubusw, regs, regd)
-
-#define	punpckhbw_m2r(var,reg)		mmx_m2r (punpckhbw, var, reg)
-#define	punpckhbw_r2r(regs,regd)	mmx_r2r (punpckhbw, regs, regd)
-#define	punpckhdq_m2r(var,reg)		mmx_m2r (punpckhdq, var, reg)
-#define	punpckhdq_r2r(regs,regd)	mmx_r2r (punpckhdq, regs, regd)
-#define	punpckhwd_m2r(var,reg)		mmx_m2r (punpckhwd, var, reg)
-#define	punpckhwd_r2r(regs,regd)	mmx_r2r (punpckhwd, regs, regd)
-
-#define	punpcklbw_m2r(var,reg) 		mmx_m2r (punpcklbw, var, reg)
-#define	punpcklbw_r2r(regs,regd)	mmx_r2r (punpcklbw, regs, regd)
-#define	punpckldq_m2r(var,reg)		mmx_m2r (punpckldq, var, reg)
-#define	punpckldq_r2r(regs,regd)	mmx_r2r (punpckldq, regs, regd)
-#define	punpcklwd_m2r(var,reg)		mmx_m2r (punpcklwd, var, reg)
-#define	punpcklwd_r2r(regs,regd)	mmx_r2r (punpcklwd, regs, regd)
-
-#define	pxor_m2r(var,reg)	mmx_m2r (pxor, var, reg)
-#define	pxor_r2r(regs,regd)	mmx_r2r (pxor, regs, regd)
-
-
-/* 3DNOW extensions */
-
-#define pavgusb_m2r(var,reg)	mmx_m2r (pavgusb, var, reg)
-#define pavgusb_r2r(regs,regd)	mmx_r2r (pavgusb, regs, regd)
-
-
-/* AMD MMX extensions - also available in intel SSE */
-
-
-#define mmx_m2ri(op,mem,reg,imm) \
-        __asm__ __volatile__ (#op " %1, %0, %%" #reg \
-                              : /* nothing */ \
-                              : "X" (mem), "X" (imm))
-#define mmx_r2ri(op,regs,regd,imm) \
-        __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
-                              : /* nothing */ \
-                              : "X" (imm) )
-
-#define	mmx_fetch(mem,hint) \
-	__asm__ __volatile__ ("prefetch" #hint " %0" \
-			      : /* nothing */ \
-			      : "X" (mem))
-
-
-#define	maskmovq(regs,maskreg)		mmx_r2ri (maskmovq, regs, maskreg)
-
-#define	movntq_r2m(mmreg,var)		mmx_r2m (movntq, mmreg, var)
-
-#define	pavgb_m2r(var,reg)		mmx_m2r (pavgb, var, reg)
-#define	pavgb_r2r(regs,regd)		mmx_r2r (pavgb, regs, regd)
-#define	pavgw_m2r(var,reg)		mmx_m2r (pavgw, var, reg)
-#define	pavgw_r2r(regs,regd)		mmx_r2r (pavgw, regs, regd)
-
-#define	pextrw_r2r(mmreg,reg,imm)	mmx_r2ri (pextrw, mmreg, reg, imm)
-
-#define	pinsrw_r2r(reg,mmreg,imm)	mmx_r2ri (pinsrw, reg, mmreg, imm)
-
-#define	pmaxsw_m2r(var,reg)		mmx_m2r (pmaxsw, var, reg)
-#define	pmaxsw_r2r(regs,regd)		mmx_r2r (pmaxsw, regs, regd)
-
-#define	pmaxub_m2r(var,reg)		mmx_m2r (pmaxub, var, reg)
-#define	pmaxub_r2r(regs,regd)		mmx_r2r (pmaxub, regs, regd)
-
-#define	pminsw_m2r(var,reg)		mmx_m2r (pminsw, var, reg)
-#define	pminsw_r2r(regs,regd)		mmx_r2r (pminsw, regs, regd)
-
-#define	pminub_m2r(var,reg)		mmx_m2r (pminub, var, reg)
-#define	pminub_r2r(regs,regd)		mmx_r2r (pminub, regs, regd)
-
-#define	pmovmskb(mmreg,reg) \
-	__asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg)
-
-#define	pmulhuw_m2r(var,reg)		mmx_m2r (pmulhuw, var, reg)
-#define	pmulhuw_r2r(regs,regd)		mmx_r2r (pmulhuw, regs, regd)
-
-#define	prefetcht0(mem)			mmx_fetch (mem, t0)
-#define	prefetcht1(mem)			mmx_fetch (mem, t1)
-#define	prefetcht2(mem)			mmx_fetch (mem, t2)
-#define	prefetchnta(mem)		mmx_fetch (mem, nta)
-
-#define	psadbw_m2r(var,reg)		mmx_m2r (psadbw, var, reg)
-#define	psadbw_r2r(regs,regd)		mmx_r2r (psadbw, regs, regd)
-
-#define	pshufw_m2r(var,reg,imm)		mmx_m2ri(pshufw, var, reg, imm)
-#define	pshufw_r2r(regs,regd,imm)	mmx_r2ri(pshufw, regs, regd, imm)
-
-#define	sfence() __asm__ __volatile__ ("sfence\n\t")
-
-#endif /* AVCODEC_I386MMX_H */
+#include "xineutils.h"
Index: libavcodec/i386/mpegvideo_mmx_template.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/mpegvideo_mmx_template.c,v
retrieving revision 1.10
diff -u -r1.10 mpegvideo_mmx_template.c
--- libavcodec/i386/mpegvideo_mmx_template.c	2 Sep 2002 16:56:29 -0000	1.10
+++ libavcodec/i386/mpegvideo_mmx_template.c	16 Sep 2002 17:55:53 -0000
@@ -84,16 +84,25 @@
     }
 
     if(s->out_format == FMT_H263 && s->mpeg_quant==0){
-    
+
+        /* the following code is patched using avifile's modifications
+           to enable -fpic compilation. this patch has not been accepted on
+           main ffmpeg cvs. */
+
         asm volatile(
             "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
             SPREADW(%%mm3)
             "pxor %%mm7, %%mm7			\n\t" // 0
             "pxor %%mm4, %%mm4			\n\t" // 0
-            "movq (%2), %%mm5			\n\t" // qmat[0]
+            "movq (%1), %%mm5			\n\t" // qmat[0]
             "pxor %%mm6, %%mm6			\n\t"
-            "psubw (%3), %%mm6			\n\t" // -bias[0]
+            "psubw (%2), %%mm6			\n\t" // -bias[0]
             "movl $-128, %%eax			\n\t"
+            : "+a" (last_non_zero_p1)
+            : "r" (qmat), "r" (bias)
+            );
+	  /* CORE */
+	  asm volatile(
             ".balign 16				\n\t"
             "1:					\n\t"
             "pxor %%mm1, %%mm1			\n\t" // 0
@@ -106,7 +115,7 @@
             "por %%mm0, %%mm4			\n\t" 
             "pxor %%mm1, %%mm0			\n\t" 
             "psubw %%mm1, %%mm0			\n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-            "movq %%mm0, (%5, %%eax)		\n\t"
+            "movq %%mm0, (%3, %%eax)		\n\t"
             "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
             "movq (%4, %%eax), %%mm1		\n\t" 
             "movq %%mm7, (%1, %%eax)		\n\t" // 0
@@ -114,6 +123,11 @@
 	    PMAXW(%%mm0, %%mm3)
             "addl $8, %%eax			\n\t"
             " js 1b				\n\t"
+            : "+a" (last_non_zero_p1)
+            : "r" (block+64), "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+            );
+	  /* EPILOGUE */
+	  asm volatile(
             "movq %%mm3, %%mm0			\n\t"
             "psrlq $32, %%mm3			\n\t"
 	    PMAXW(%%mm0, %%mm3)
@@ -122,48 +136,46 @@
 	    PMAXW(%%mm0, %%mm3)
             "movd %%mm3, %%eax			\n\t"
             "movzbl %%al, %%eax			\n\t" // last_non_zero_p1
-	    : "+a" (last_non_zero_p1)
-            : "r" (block+64), "r" (qmat), "r" (bias),
-              "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
-        );
-        // note the asm is split cuz gcc doesnt like that many operands ...
-        asm volatile(
-            "movd %1, %%mm1			\n\t" // max_qcoeff
+            "movd %2, %%mm1			\n\t" // max_qcoeff
 	    SPREADW(%%mm1)
             "psubusw %%mm1, %%mm4		\n\t" 
             "packuswb %%mm4, %%mm4		\n\t"
-            "movd %%mm4, %0			\n\t" // *overflow
-        : "=g" (*overflow)
-        : "g" (s->max_qcoeff)
-        );
+            "movd %%mm4, %1			\n\t" // *overflow
+            : "+a" (last_non_zero_p1), "=r" (*overflow)
+            : "r" (s->max_qcoeff)
+            );
     }else{ // FMT_H263
         asm volatile(
-            "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
+            "pushl %%ebp				\n\t"
+            "pushl %%ebx				\n\t"
+            "movl %0, %%ebp				\n\t"
+            "movl (%%ebp), %%ebx		\n\t"
+            "movd %%ebx, %%mm3			\n\t" // last_non_zero_p1
             SPREADW(%%mm3)
             "pxor %%mm7, %%mm7			\n\t" // 0
             "pxor %%mm4, %%mm4			\n\t" // 0
-            "movl $-128, %%eax			\n\t"
+            "movl $-128, %%ebx			\n\t"
             ".balign 16				\n\t"
             "1:					\n\t"
             "pxor %%mm1, %%mm1			\n\t" // 0
-            "movq (%1, %%eax), %%mm0		\n\t" // block[i]
+            "movq (%1, %%ebx), %%mm0		\n\t" // block[i]
             "pcmpgtw %%mm0, %%mm1		\n\t" // block[i] <= 0 ? 0xFF : 0x00
             "pxor %%mm1, %%mm0			\n\t" 
             "psubw %%mm1, %%mm0			\n\t" // ABS(block[i])
-            "movq (%3, %%eax), %%mm6		\n\t" // bias[0]
+            "movq (%3, %%ebx), %%mm6		\n\t" // bias[0]
             "paddusw %%mm6, %%mm0		\n\t" // ABS(block[i]) + bias[0]
-            "movq (%2, %%eax), %%mm5		\n\t" // qmat[i]
+            "movq (%2, %%ebx), %%mm5		\n\t" // qmat[i]
             "pmulhw %%mm5, %%mm0		\n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
             "por %%mm0, %%mm4			\n\t" 
             "pxor %%mm1, %%mm0			\n\t" 
             "psubw %%mm1, %%mm0			\n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-            "movq %%mm0, (%5, %%eax)		\n\t"
+            "movq %%mm0, (%5, %%ebx)		\n\t"
             "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
-            "movq (%4, %%eax), %%mm1		\n\t" 
-            "movq %%mm7, (%1, %%eax)		\n\t" // 0
+            "movq (%4, %%ebx), %%mm1		\n\t" 
+            "movq %%mm7, (%1, %%ebx)		\n\t" // 0
             "pandn %%mm1, %%mm0			\n\t"
 	    PMAXW(%%mm0, %%mm3)
-            "addl $8, %%eax			\n\t"
+            "addl $8, %%ebx			\n\t"
             " js 1b				\n\t"
             "movq %%mm3, %%mm0			\n\t"
             "psrlq $32, %%mm3			\n\t"
@@ -171,10 +183,14 @@
             "movq %%mm3, %%mm0			\n\t"
             "psrlq $16, %%mm3			\n\t"
 	    PMAXW(%%mm0, %%mm3)
-            "movd %%mm3, %%eax			\n\t"
-            "movzbl %%al, %%eax			\n\t" // last_non_zero_p1
-	    : "+a" (last_non_zero_p1)
-            : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+            "movd %%mm3, %%ebx			\n\t"
+            "movzbl %%bl, %%ebx			\n\t" // last_non_zero_p1
+            "movl %%ebx, (%%ebp)		\n\t"
+            "popl %%ebx					\n\t"
+            "popl %%ebp					\n\t"
+            :
+			: "m" (last_non_zero_p1),
+              "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
         );
         // note the asm is split cuz gcc doesnt like that many operands ...
@@ -184,8 +200,8 @@
             "psubusw %%mm1, %%mm4		\n\t" 
             "packuswb %%mm4, %%mm4		\n\t"
             "movd %%mm4, %0			\n\t" // *overflow
-        : "=g" (*overflow)
-        : "g" (s->max_qcoeff)
+        : "=r" (*overflow)
+        : "r" (s->max_qcoeff)
         );
     }
 
