-rw-r--r--  xine_post_swscale.c  1703
1 files changed, 1703 insertions, 0 deletions
diff --git a/xine_post_swscale.c b/xine_post_swscale.c
new file mode 100644
index 00000000..bea7ce0a
--- /dev/null
+++ b/xine_post_swscale.c
@@ -0,0 +1,1703 @@
+/*
+ * Copyright (C) 2000-2007 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * $Id: xine_post_swscale.c,v 1.1 2008-02-20 22:31:23 phintuka Exp $
+ *
+ * Simple (faster) resize for avisynth
+ * Copyright (C) 2002 Tom Barry
+ *
+ * Very simple 2 tap linear interpolation.
+ * It is unfiltered which means it will not soften much.
+ *
+ * WarpedResize will do a non-linear stretch/squeeze in both the horizontal
+ * and vertical dimensions. This can be useful when you want to change the
+ * aspect ratio of a video clip and have it mostly distorted at the
+ * top, bottom, and side edges.
+ *
+ *
+ * Ported to linux/xine by Petri Hintukainen <phintuka@users.sourceforge.net>
+ * - Added x86_64 support
+ * - Added PIC support (do not clobber ebx in x86, access only local variables from asm)
+ * - Fixed yv12 stretched warp tables generation
+ */
+
+#include <xine/xine_internal.h>
+#include <xine/post.h>
+
+/*#define DBG(x...)*/
+#define DBG(x...) fprintf(stderr, "post_warp: " x)
+
+#define STREAMING_STORE
+#define PREFETCH
+/*#define VANILLA*/
+
+/*
+ * This function accepts a position from 0 to 1 and warps it, to 0 through 1 based
+ * upon the wFact var. The warp equations are designed to:
+ *
+ * * Always be rising but yield results from 0 to 1
+ *
+ * * Have a first derivative that doesn't go to 0 or infinity, at least close
+ * to the center of the screen
+ *
+ * * Have a curvature (absolute val of 2nd derivative) that is small in the
+ * center and smoothly rises towards the edges. We would like the curvature
+ * to be everywhere = 0 when the warp factor = 1
+ */
+static double WarpFactor(double position, double wFact)
+{
+ double x;
+ double z;
+ double w;
+ x = 2 * (position - .5);
+  /* The warp is calculated as z = (1 - w) * x^3 + w * x, centered
+   * around .5 and ranging from 0 to 1. After some tinkering this seems
+   * to give decent values and derivatives at the right places.
+   */
+  w = 2.0 - wFact; /* reverse parm for compat with initial release */
+
+ if (x < 0.0) {
+ z = -(1 - w) * x*x*x - w * x; /* -1 < x < 0, wFact < 1 */
+ return .5 - .5 * z;
+ } else {
+    z = (1 - w) * x*x*x + w * x; /* 0 <= x <= 1, wFact < 1 */
+ return .5 + .5 * z; /* amts to same formula as above for now */
+ }
+}
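+
+/* Worked example (illustrative values): with wFact = 0.8 we get w = 1.2, so
+ * position = 0.75 gives x = 0.5 and z = (1 - 1.2) * 0.125 + 1.2 * 0.5 = 0.575,
+ * i.e. a returned position of .5 + .5 * 0.575 = 0.7875 - slightly past the
+ * linear 0.75, so the mapping samples further into the source near the edges.
+ */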
+
+/*
+ * YV12
+ *
+ * For each horizontal output pair of pixels there are 2 qword masks followed by 2 int
+ * offsets. The 2 masks are the weights to be used for the luma and chroma, respectively.
+ * Each mask contains LeftWeight1, RightWeight1, LeftWeight2, RightWeight2. So a pair of pixels
+ * will be processed on each pass through the horizontal resize loop. I think with my
+ * current math the horizontal luma and chroma contain the same values, but since I may
+ * have screwed it up I'll leave it this way for now. Vertical chroma is different.
+ *
+ * Note - try just using the luma calcs for both; they seem to be the same.
+ *
+ * The weights are scaled 0-256 and the left and right weights will sum to 256 for each pixel.
+ */
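+
+/* Layout of one output pixel pair in hControl - six uint32_t's, summarized
+ * from the indexing used below (entries +2/+3 are only written for YUY2):
+ *   hControl[i*3+0]  even pixel luma weights (wY2 << 16 | wY1)
+ *   hControl[i*3+1]  odd  pixel luma weights
+ *   hControl[i*3+2]  even pixel chroma weights (YUY2)
+ *   hControl[i*3+3]  odd  pixel chroma weights (YUY2)
+ *   hControl[i*3+4]  even pixel source offset
+ *   hControl[i*3+5]  odd  pixel source offset
+ */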
+static void init_tables_yv12(int newwidth, int newheight, int oldwidth, int oldheight,
+ int Interlaced, double hWarp, double vWarp,
+ uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights,
+ uint32_t *hControlUV, uint32_t *vOffsetsUV, uint32_t *vWeightsUV)
+{
+ int i;
+ int j;
+ int k;
+ int wY1;
+ int wY2;
+ DBG("init_yv12: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n",
+ oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);
+
+  /* First set up the horizontal table; use it for both luma & chroma since
+   * they seem to have the same equation.
+   * We will generate these values in pairs, mostly because that's the way
+   * I wrote it for YUY2.
+   */
+
+ for(i=0; i < newwidth; i+=2) {
+ /* first make even pixel control */
+    if (hWarp==1) /* if no warp factor */
+ j = i * 256 * (oldwidth-1) / (newwidth-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1));
+
+ k = j>>8;
+ wY2 = j - (k << 8); /* luma weight of right pixel */
+ wY1 = 256 - wY2; /* luma weight of left pixel */
+
+ if (k > oldwidth - 2) {
+ hControl[i*3+4] = oldwidth - 1; /* point to last byte */
+ hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */
+ } else {
+ hControl[i*3+4] = k; /* pixel offset */
+ hControl[i*3] = wY2 << 16 | wY1; /* luma weights */
+ }
+
+ /* now make odd pixel control */
+ if (hWarp==1) /* if no warp factor */
+ j = (i+1) * 256 * (oldwidth-1) / (newwidth-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1));
+
+ k = j>>8;
+ wY2 = j - (k << 8); /* luma weight of right pixel */
+ wY1 = 256 - wY2; /* luma weight of left pixel */
+
+ if (k > oldwidth - 2) {
+ hControl[i*3+5] = oldwidth - 1; /* point to last byte */
+ hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */
+ } else {
+ hControl[i*3+5] = k; /* pixel offset */
+ hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */
+ }
+ }
+
+ hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */
+ hControl[newwidth*3+5] = 2 * (oldwidth-1); /* " */
+#ifndef VANILLA
+  /* UV */
+ for(i=0; i < newwidth/2; i+=2) {
+ /* first make even pixel control */
+    if (hWarp==1) /* if no warp factor */
+ j = i * 256 * (oldwidth/2-1) / (newwidth/2-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newwidth/2-1.0), hWarp) * (oldwidth/2-1));
+
+ k = j>>8;
+    wY2 = j - (k << 8); /* chroma weight of right pixel */
+    wY1 = 256 - wY2; /* chroma weight of left pixel */
+
+    if (k > oldwidth/2 - 2) {
+      hControlUV[i*3+4] = oldwidth/2 - 1; /* point to last byte */
+      hControlUV[i*3] = 0x00000100; /* use 100% of rightmost chroma */
+    } else {
+      hControlUV[i*3+4] = k; /* pixel offset */
+      hControlUV[i*3] = wY2 << 16 | wY1; /* chroma weights */
+    }
+
+ /* now make odd pixel control */
+ if (hWarp==1) /* if no warp factor */
+ j = (i+1) * 256 * (oldwidth/2-1) / (newwidth/2-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor((i+1) / (newwidth/2-1.0), hWarp) * (oldwidth/2-1));
+
+ k = j>>8;
+    wY2 = j - (k << 8); /* chroma weight of right pixel */
+    wY1 = 256 - wY2; /* chroma weight of left pixel */
+
+    if (k > oldwidth/2 - 2) {
+      hControlUV[i*3+5] = oldwidth/2 - 1; /* point to last byte */
+      hControlUV[i*3+1] = 0x00000100; /* use 100% of rightmost chroma */
+    } else {
+      hControlUV[i*3+5] = k; /* pixel offset */
+      hControlUV[i*3+1] = wY2 << 16 | wY1; /* chroma weights */
+    }
+ }
+
+ hControlUV[newwidth/2*3+4] = (oldwidth/2-1); /* give it something to prefetch at end */
+ hControlUV[newwidth/2*3+5] = (oldwidth/2-1); /* " */
+#endif
+
+  /* Next set up the vertical tables. The offsets are measured in lines and
+   * will be multiplied by the source pitch later. */
+
+ /* For YV12 we need separate Luma and chroma tables */
+
+ /* First Luma Table */
+ for(i=0; i< newheight; ++i) {
+ if (vWarp==1) /* if no warp factor */
+ j = i * 256 * (oldheight-1) / (newheight-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1));
+
+ if (Interlaced) { /* do hard way? */
+ if (i%2) { /* is odd output line? */
+ if (j < 256) { /* before 1st odd input line */
+ vOffsets[i] = 1; /* all from line 1 */
+ vWeights[i] = 0; /* weight to give to 2nd line */
+ } else {
+ k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* is even output line */
+ k = (j >> 9) << 1; /* next lower even line */
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* simple way, do as progressive */
+ k = j >> 8;
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2; /* weight to give to 2nd line */
+ }
+ }
+
+ /* Vertical table for chroma */
+ for(i=0; i< newheight/2; ++i) {
+ if (vWarp==1) /* if no warp factor */
+#ifdef VANILLA
+ j = (int) ( (i+.25) * 256 * (oldheight-1) / (newheight-1.0) - 64 );
+#else
+ j = (int) ( (i+.25) * 256 * (oldheight/2-1) / (newheight/2-1.0) - 64 );
+#endif
+ else /* stretch and warp somehow */
+#ifdef VANILLA
+ j = (int) (256 * WarpFactor( (i+.25) / (newheight-1.0), vWarp) * (oldheight-1.0) );
+#else
+ j = (int) (256 * WarpFactor( (i+.25) / (newheight/2 - 1.0), vWarp) * (oldheight/2 - 1.0) );
+#endif
+#ifndef VANILLA
+ if(j<0) j=0;
+#endif
+ if (Interlaced) { /* do hard way? */
+ if (i%2) { /* is odd output line? */
+ if (j < 256) { /* before 1st odd input line */
+ vOffsetsUV[i] = 1; /* all from line 1 */
+ vWeightsUV[i] = 0; /* weight to give to 2nd line */
+ } else {
+ k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
+ vOffsetsUV[i] = k;
+ wY2 = j - (k << 8);
+ vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* is even output line */
+#ifdef VANILLA
+ k = (j >> 9) << 1; /* next lower even line */
+ vOffsetsUV[i] = k;
+ wY2 = j - (k << 8);
+ vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
+#else
+ k = (j / 512) << 1; /* next lower even line */
+ vOffsetsUV[i] = k;
+ wY2 = j - (k << 8);
+ vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
+#endif
+ }
+ } else { /* simple way, do as progressive */
+#ifdef VANILLA
+ k = j >> 8;
+#else
+ k = j / 256; /* j >> 8; does not work right if -256 < j < 0 */
+#endif
+ vOffsetsUV[i] = k;
+ wY2 = j - (k << 8);
+ vWeightsUV[i] = wY2; /* weight to give to 2nd line */
+ }
+ }
+}
+
+/*
+ * YUY2
+ *
+ * For each horizontal output pair of pixels there are 2 qword masks followed by 2 int
+ * offsets. The 2 masks are the weights to be used for the luma and chroma, respectively.
+ * Each mask contains LeftWeight1, RightWeight1, LeftWeight2, RightWeight2. So a pair of pixels
+ * will be processed on each pass through the horizontal resize loop.
+ *
+ * The weights are scaled 0-256 and the left and right weights will sum to 256 for each pixel.
+ */
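+
+/* Note on the chroma weights computed below: U/V are sampled at half the
+ * luma rate, so the luma source position (k + wY2/256) is halved; for odd k
+ * that lands half a chroma sample past the chroma pair at k/2, hence
+ * wUV2 = 128 + (wY2 >> 1) for odd k and wUV2 = wY2 >> 1 for even k.
+ */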
+static void init_tables_yuy2(int newwidth, int newheight, int oldwidth, int oldheight,
+ int Interlaced, double hWarp, double vWarp,
+ uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights )
+{
+ int i;
+ int j;
+ int k;
+ int wY1;
+ int wY2;
+ int wUV1;
+ int wUV2;
+ DBG("init_yuy2: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n",
+ oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);
+
+ /* First set up horizontal table */
+ for(i=0; i < newwidth; i+=2) {
+ /* first make even pixel control */
+ if (hWarp==1) /* if no warp factor */
+ j = i * 256 * (oldwidth-1) / (newwidth-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1));
+
+ k = j>>8;
+ wY2 = j - (k << 8); /* luma weight of right pixel */
+ wY1 = 256 - wY2; /* luma weight of left pixel */
+ wUV2 = (k%2) ? 128 + (wY2 >> 1) : wY2 >> 1;
+ wUV1 = 256 - wUV2;
+
+ if (k > oldwidth - 2) {
+ hControl[i*3+4] = oldwidth - 1; /* point to last byte */
+ hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */
+ hControl[i*3+2] = 0x00000100; /* use 100% of rightmost U */
+ } else {
+ hControl[i*3+4] = k; /* pixel offset */
+ hControl[i*3] = wY2 << 16 | wY1; /* luma weights */
+ hControl[i*3+2] = wUV2 << 16 | wUV1; /* chroma weights */
+ }
+
+ /* now make odd pixel control */
+ if (hWarp==1) /* if no warp factor */
+ j = (i+1) * 256 * (oldwidth-1) / (newwidth-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1));
+
+ k = j>>8;
+ wY2 = j - (k << 8); /* luma weight of right pixel */
+ wY1 = 256 - wY2; /* luma weight of left pixel */
+ wUV2 = (k%2) ? 128 + (wY2 >> 1) : wY2 >> 1;
+ wUV1 = 256 - wUV2;
+
+ if (k > oldwidth - 2) {
+ hControl[i*3+5] = oldwidth - 1; /* point to last byte */
+ hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */
+ hControl[i*3+3] = 0x00000100; /* use 100% of rightmost V */
+ } else {
+ hControl[i*3+5] = k; /* pixel offset */
+ hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */
+ /* hControl[i*3+3] = wUV2 << 16 | wUV1; // chroma weights */
+ /* horiz chroma weights should be same as for even pixel - trbarry 09/16/2002 */
+ hControl[i*3+3] = hControl[i*3+2]; /* chroma weights */
+ }
+ }
+
+ hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */
+ hControl[newwidth*3+5] = 2 * (oldwidth-1);
+
+  /* Next set up the vertical table. The offsets are measured in lines and
+   * will be multiplied by the source pitch later. */
+ for(i=0; i< newheight; ++i) {
+ if (vWarp==1) /* if no warp factor */
+ j = i * 256 * (oldheight-1) / (newheight-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1));
+
+ if (Interlaced) { /* do hard way? */
+ if (i%2) { /* is odd output line? */
+ if (j < 256) { /* before 1st odd input line */
+ vOffsets[i] = 1; /* all from line 1 */
+ vWeights[i] = 0; /* weight to give to 2nd line */
+ } else {
+ k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* is even output line */
+ k = (j >> 9) << 1; /* next lower even line */
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* simple way, do as progressive */
+ k = j >> 8;
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2; /* weight to give to 2nd line */
+ }
+ }
+}
+
+/* Register allocation */
+/* index/counter registers (REGA, REGC) are loaded from 32-bit vars/arrays! */
+#define REGEA "eax"
+#define REGEB "ebx"
+#if defined(__x86_64__)
+# define REGA "rax"
+# define REGB "rbx"
+# define REGC "ecx"
+# define REGD "rdx"
+# define REGDI "rdi"
+# define REGSI "rsi"
+#elif defined(__i386__)
+# define REGA "eax"
+# define REGB "ebx"
+# define REGC "ecx"
+# define REGD "edx"
+# define REGDI "edi"
+# define REGSI "esi"
+#endif
+
+/* variables accessed from assembler code */
+#define _FPround1 "%0"
+#define _vWeight1 "%1"
+#define _vWeight2 "%2"
+#define _YMask "%3"
+#define _src_row_size "%4"
+#define _EndOffset "%5"
+#define _pControl "%6"
+#define _row_size "%7"
+#define _vWorkYW "%8"
+#define _dstp "%9"
+#define _vWorkUVW "%10"
+#define _FPround2 "%11"
+#define _srcp1 "%12"
+#define _srcp2 "%13"
+#if !defined(__x86_64__)
+#define _oldbx "%14"
+#define _SSEMMXenabledW "%15"
+#define _SSE2enabledW "%16"
+#endif
+
+/* structure for mmx constants */
+typedef union {
+ uint64_t uq[1]; /* Unsigned Quadword */
+ uint32_t ud[2]; /* Unsigned Doubleword */
+} ATTR_ALIGN(16) mmx_t;
+
+/* structure for sse2 constants */
+typedef union {
+ uint64_t uq[2]; /* Unsigned Quadword */
+ uint32_t ud[4]; /* Unsigned Doubleword */
+} ATTR_ALIGN(16) sse2_t;
+
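+/* For reference, a scalar C sketch of the vertical pass that the asm loops
+ * below implement. This helper is purely illustrative (hypothetical, never
+ * called): w2 is the 0-256 fixed point weight of the second line, taken
+ * from vWeights[], and 256-w2 the weight of the first.
+ */
+static inline void __attribute__((unused))
+vertical_blend_ref(uint8_t *dst, const uint8_t *top, const uint8_t *bot,
+                   int n, uint32_t w2)
+{
+  const uint32_t w1 = 256 - w2;
+  int i;
+  for (i = 0; i < n; i++)  /* blend, round and scale back to 8 bits */
+    dst[i] = (uint8_t)((w1 * top[i] + w2 * bot[i] + 128) >> 8);
+}
+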
+
+static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
+ const int dst_pitch, const int src_pitch,
+ const int dst_width, const int dst_height,
+ const int src_width, const int src_height,
+ const int Interlaced, const uint32_t * const hControl,
+ const uint32_t * const vOffsets, const uint32_t * const vWeights,
+ uint32_t *vWorkY, uint32_t *vWorkUV,
+ int dst_start)
+{
+#if defined(__i386__) || defined(__x86_64__)
+ const sse2_t YMask = {uq:{UINT64_C(0x00ff00ff00ff00ff),UINT64_C(0x00ff00ff00ff00ff)}}; /* keeps only luma */
+ const sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */
+ const sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */
+ sse2_t vWeight1;
+ sse2_t vWeight2;
+
+ const uint32_t *pControl = &hControl[0];
+ const uint32_t *vWorkYW = vWorkY;
+ const uint32_t *vWorkUVW = vWorkUV;
+ const uint8_t *srcp = src;
+ const uint8_t *srcp1;
+ const uint8_t *srcp2;
+ uint8_t *dstp = dst + dst_pitch*dst_start;
+
+ const uint32_t src_row_size = src_width * 2;
+ const uint32_t row_size = dst_width * 2;
+ const uint32_t EndOffset = src_row_size / 2;
+
+#if !defined(__x86_64__)
+ const int accel = xine_mm_accel();
+ const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */
+ const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
+ long int oldbx;
+#endif
+ int y;
+
+ for (y = dst_start; y < dst_height; y++) {
+
+ if(vOffsets[y] >= src_height) {
+ /* slice completed */
+ /*DBG("do_warp_yuy2: max input height reached: need line %d, height %d\n -> Returning next output line: %d\n",
+ vOffsets[y], src_height, y);*/
+ return y;
+ }
+
+ vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] =
+ (256-vWeights[y]) << 16 | (256-vWeights[y]);
+ vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] =
+ vWeights[y] << 16 | vWeights[y];
+
+ srcp1 = srcp + vOffsets[y] * src_pitch;
+ if (Interlaced)
+ srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1;
+ else
+ srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1;
+
+ __asm__ __volatile__ (
+#if !defined(__x86_64__)
+ /* store ebx (PIC) */
+ "mov %%"REGB", "_oldbx" \n\t"
+#endif
+ "movl "_src_row_size", %%"REGC" \n\t"
+ "shrl $3, %%"REGC" \n\t" /* 8 bytes a time */
+ "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */
+ "mov "_srcp2", %%"REGD" \n\t" /* next " */
+ "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */
+ "mov "_vWorkUVW", %%"REGB" \n\t" /* luma work destination line */
+ "xor %%"REGA", %%"REGA" \n\t"
+#if !defined(__x86_64__)
+ /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
+ * This first loop is not the performance bottleneck anyway but it is trivial to tune
+ * using SSE2 if we have proper alignment.
+ */
+ "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported?*/
+ "jz vMaybeSSEMMX \n\t" /* n, can't do anyway*/
+#endif
+ "cmpl $2, %%"REGC" \n\t" /* we have at least 16 bytes, 2 qwords? */
+ "jl vMaybeSSEMMX \n\t" /* n, don't bother*/
+
+ "shrl $1, %%"REGC" \n\t" /* do 16 bytes at a time instead*/
+ "decl %%"REGC" \n" /* jigger loop ct */
+
+ ".align 16 \n\t"
+
+ "movdqa "_FPround1", %%xmm0 \n\t"
+ "movdqa "_vWeight1", %%xmm5 \n\t"
+ "movdqa "_vWeight2", %%xmm6 \n\t"
+ "movdqa "_YMask", %%xmm7 \n"
+
+ "vLoopSSE2_Fetch: \n\t"
+#ifdef PREFETCH
+ " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
+ " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
+#endif
+ "vLoopSSE2: \n\t"
+ " movdqu (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
+ " movdqu (%%"REGD", %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */
+
+ " movdqa %%xmm1, %%xmm3 \n\t" /* get chroma bytes */
+ " pand %%xmm7, %%xmm1 \n\t" /* keep only luma */
+ " psrlw $8, %%xmm3 \n\t" /* right just chroma */
+ " pmullw %%xmm5, %%xmm1 \n\t" /* mult by weighting factor */
+ " pmullw %%xmm5, %%xmm3 \n\t" /* mult by weighting factor */
+
+ " movdqa %%xmm2, %%xmm4 \n\t" /* get chroma bytes */
+ " pand %%xmm7, %%xmm2 \n\t" /* keep only luma */
+ " psrlw $8, %%xmm4 \n\t" /* right just chroma */
+ " pmullw %%xmm6, %%xmm2 \n\t" /* mult by weighting factor */
+ " pmullw %%xmm6, %%xmm4 \n\t" /* mult by weighting factor */
+
+ " paddw %%xmm2, %%xmm1 \n\t" /* combine lumas */
+ " paddusw %%xmm0, %%xmm1 \n\t" /* round */
+ " psrlw $8, %%xmm1 \n\t" /* right adjust luma */
+#ifdef STREAMING_STORE
+ " movntdq %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+#else
+ " movdqu %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+#endif
+ " paddw %%xmm4, %%xmm3 \n\t" /* combine chromas */
+ " paddusw %%xmm0, %%xmm3 \n\t" /* round */
+ " psrlw $8, %%xmm3 \n\t" /* right adjust chroma */
+ " packuswb %%xmm3, %%xmm3 \n\t" /* pack UV's into low dword */
+ " movdq2q %%xmm3, %%mm1 \n\t" /* save in our work area */
+#ifdef STREAMING_STORE
+ " movntq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
+#else
+ " movq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
+#endif
+ " lea 8(%%"REGA"), %%"REGA" \n\t"
+ " decl %%"REGC" \n\t"
+
+ " jg vLoopSSE2_Fetch \n\t" /* if not on last one loop, prefetch */
+ " jz vLoopSSE2 \n\t" /* or just loop, or not */
+
+ /* done with our SSE2 fortified loop but we may need to pick up the spare change */
+#ifdef STREAMING_STORE
+ " sfence \n\t"
+#endif
+ " movl "_src_row_size", %%"REGC" \n\t" /* get count again */
+ " andl $15, %%"REGC" \n\t" /* just need mod 16 */
+
+ " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
+ " movq "_vWeight1", %%mm5 \n\t"
+ " movq "_vWeight2", %%mm6 \n\t"
+ " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
+
+ " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
+ " jz MoreSpareChange \n" /* n, did them all */
+
+ /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
+ * This first loop is not the performance bottleneck anyway but it is trivial to tune
+ * using SSE if we have proper alignment.
+ */
+ "vMaybeSSEMMX: \n\t"
+
+ " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
+ " movq "_vWeight1", %%mm5 \n\t"
+ " movq "_vWeight2", %%mm6 \n\t"
+ " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
+#if !defined(__x86_64__)
+ " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
+ " jz vLoopMMX \n\t" /* n, can't do anyway */
+#endif
+ " decl %%"REGC" \n" /* jigger loop ctr */
+
+ ".align 16 \n"
+ "vLoopSSEMMX_Fetch: \n\t"
+#ifdef PREFETCH
+ " prefetcht0 8(%%"REGSI", %%"REGA", 2) \n\t"
+ " prefetcht0 8(%%"REGD", %%"REGA", 2) \n"
+#endif
+ "vLoopSSEMMX: \n\t"
+ " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
+ " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
+
+ " movq %%mm1, %%mm3 \n\t" /* copy top bytes */
+ " pand %%mm7, %%mm1 \n\t" /* keep only luma */
+ " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */
+ " psrlw $8, %%mm3 \n\t" /* right just chroma */
+ " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */
+ " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */
+
+ " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */
+ " pand %%mm7, %%mm2 \n\t" /* keep only luma */
+ " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */
+ " psrlw $8, %%mm4 \n\t" /* right just chroma */
+ " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */
+ " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */
+
+ " paddw %%mm2, %%mm1 \n\t" /* combine lumas */
+ " paddusw %%mm0, %%mm1 \n\t" /* round */
+ " psrlw $8, %%mm1 \n\t" /* right adjust luma */
+#ifdef STREAMING_STORE
+ " movntq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+#else
+ " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+#endif
+ " paddw %%mm4, %%mm3 \n\t" /* combine chromas */
+ " paddusw %%mm0, %%mm3 \n\t" /* round */
+ " psrlw $8, %%mm3 \n\t" /* right adjust chroma */
+ " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */
+ " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
+
+ " lea 4(%%"REGA"), %%"REGA" \n\t"
+ " decl %%"REGC" \n\t"
+ " jg vLoopSSEMMX_Fetch \n\t" /* if not on last one loop, prefetch */
+ " jz vLoopSSEMMX \n\t" /* or just loop, or not */
+#ifdef STREAMING_STORE
+ " sfence \n\t"
+#endif
+ " jmp MoreSpareChange \n" /* all done with vertical */
+
+ ".align 16 \n"
+ "vLoopMMX: \n\t"
+
+ " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
+ " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
+
+ " movq %%mm1, %%mm3 \n\t" /* copy top bytes */
+ " pand %%mm7, %%mm1 \n\t" /* keep only luma */
+ " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */
+ " psrlw $8, %%mm3 \n\t" /* right just chroma */
+ " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */
+ " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */
+
+ " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */
+ " pand %%mm7, %%mm2 \n\t" /* keep only luma */
+ " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */
+ " psrlw $8, %%mm4 \n\t" /* right just chroma */
+ " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */
+ " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */
+
+ " paddw %%mm2, %%mm1 \n\t" /* combine lumas */
+ " paddusw %%mm0, %%mm1 \n\t" /* round */
+ " psrlw $8, %%mm1 \n\t" /* right adjust luma */
+ " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+
+ " paddw %%mm4, %%mm3 \n\t" /* combine chromas */
+ " paddusw %%mm0, %%mm3 \n\t" /* round */
+ " psrlw $8, %%mm3 \n\t" /* right adjust chroma */
+ " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */
+ " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
+
+ " lea 4(%%"REGA"), %%"REGA" \n\t"
+ " loop vLoopMMX \n"
+
+     /* Add a little code here to check if we have 2 more pixels to do and, if so, make one
+      * more pass thru vLoopMMX. We were processing in multiples of 4 pixels and always have
+      * an even number so there will never be more than 2 left. trbarry 7/29/2002
+      */
+ "MoreSpareChange: \n\t"
+
+ " cmpl "_EndOffset", %%"REGEA" \n\t" /* did we get them all */
+ " jnl DoHorizontal \n\t" /* yes, else have 2 left */
+ " movl $1, %%"REGC" \n\t" /* jigger loop ct */
+ " sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
+ " jmp vLoopMMX \n"
+
+ /* We've taken care of the vertical scaling, now do horizontal */
+ "DoHorizontal: \n\t"
+
+ " movq "_YMask", %%mm7 \n\t" /* useful 0U0U.. mask constant */
+ " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
+ " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
+ " movl "_row_size", %%"REGC" \n\t"
+ " shrl $2, %%"REGC" \n\t" /* bytes a time, 2 pixels */
+ " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */
+ " mov "_dstp", %%"REGDI" \n\t" /* the destination line */
+ " mov "_vWorkUVW", %%"REGB" \n" /* chroma data, as UVUV UVUV... */
+
+ ".align 16 \n"
+ "hLoopMMX: \n\t"
+
+ /* x86_64: must use movl (accessing table of uint32's) */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movd (%%"REGD", %%"REGA", 2), %%mm0 \n\t" /* copy luma pair */
+ " shr $1, %%"REGA" \n\t" /* div offset by 2 */
+ " movd (%%"REGB", %%"REGA", 2), %%mm1 \n\t" /* copy UV pair VUVU */
+ " psllw $8, %%mm1 \n\t" /* shift out V, keep 0000U0U0 */
+
+      /* we need to use both even and odd chroma from same location - trb 9/2002 */
+ " punpckldq (%%"REGB", %%"REGA", 2), %%mm1 \r\n" /* copy UV pair VUVU */
+ " psrlw $8, %%mm1 \r\n" /* shift out U0, keep 0V0V 0U0U */
+ " movl 20(%%"REGSI"), %%"REGEA" \r\n" /* get data offset in pixels, 2nd pixel pair */
+ " punpckldq (%%"REGD", %%"REGA", 2), %%mm0 \r\n" /* copy luma pair */
+
+ " pmaddwd (%%"REGSI"), %%mm0 \r\n" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \r\n" /* round */
+ " psrlw $8, %%mm0 \r\n" /* right just 2 luma pixel value 000Y,000Y */
+
+ " pmaddwd 8(%%"REGSI"), %%mm1 \r\n" /* mult and sum chromas by ctl weights */
+ " paddusw %%mm6, %%mm1 \r\n" /* round */
+ " pslld $8, %%mm1 \r\n" /* shift into low bytes of different words */
+ " pand %%mm7, %%mm1 \r\n" /* keep only 2 chroma values 0V00,0U00 */
+ " por %%mm1, %%mm0 \r\n" /* combine luma and chroma, 0V0Y,0U0Y */
+ " packuswb %%mm0, %%mm0 \r\n" /* pack all into low dword, xxxxVYUY */
+ " movd %%mm0, (%%"REGDI") \n\t" /* done with 2 pixels */
+
+ " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytest */
+ " lea 4(%%"REGDI"), %%"REGDI" \n\t" /* bump to next output pixel addr */
+
+ " loop hLoopMMX \n\t" /* loop for more */
+
+ "emms \n\t"
+ /* done with one line */
+
+#if !defined(__x86_64__)
+ "mov "_oldbx", %%"REGB" \n\t"
+#endif
+ ::
+ "m" /*0*/(FPround1),
+ "m" /*1*/(vWeight1),
+ "m" /*2*/(vWeight2),
+ "m" /*3*/(YMask),
+ "m" /*4*/(src_row_size),
+ "m" /*5*/(EndOffset),
+ "m" /*6*/(pControl),
+ "m" /*7*/(row_size),
+ "m" /*8*/(vWorkYW),
+ "m" /*9*/(dstp),
+ "m" /*10*/(vWorkUVW),
+ "m" /*11*/(FPround2),
+ "m" /*12*/(srcp1),
+ "m" /*13*/(srcp2)
+#if !defined(__x86_64__)
+ ,
+ "m" /*14*/(oldbx),
+ "m" /*15*/(SSEMMXenabledW),
+ "m" /*16*/(SSE2enabledW)
+ : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI
+#else
+ : REGA, REGB, REGC, REGD, REGSI, REGDI
+#endif
+ );
+
+ dstp += dst_pitch;
+ }
+#endif
+ return 0;
+}
+
+static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
+ const int dst_pitch, const int src_pitch,
+ const int dst_width, const int dst_height,
+ const int src_width, const int src_height,
+ const int Interlaced, const uint32_t * const hControl,
+ const uint32_t * vOffsets, const uint32_t * vWeights,
+ uint32_t *vWorkY, int dst_start)
+{
+#if defined(__i386__) || defined(__x86_64__)
+ const sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */
+ const sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */
+ sse2_t vWeight1;
+ sse2_t vWeight2;
+
+ const uint32_t *pControl = &hControl[0];
+ const uint32_t *vWorkYW = vWorkY;
+ const uint8_t *srcp = src;
+ const uint8_t *srcp1;
+ const uint8_t *srcp2;
+ uint8_t *dstp = dst + dst_pitch*dst_start;
+
+ const uint32_t src_row_size = src_width;
+ const uint32_t row_size = dst_width;
+
+#if !defined(__x86_64__)
+ const int accel = xine_mm_accel();
+ const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */
+ const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
+ long int oldbx;
+#endif
+ int y;
+
+  /* Operation in sliced mode:
+   * - continue until the required next source line is out of the slice
+   * - return the next output line
+   * - at the next call, continue from the next source line
+   */
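+
+  /* A hypothetical slice driver would feed consecutive source slices and
+   * carry the return value forward, e.g.:
+   *
+   *   int next = 0;
+   *   while ((next = do_warp_yv12(dst, slice, ..., next)) > 0)
+   *     advance to the next source slice
+   *
+   * (this plugin itself always passes dst_start = 0 from draw_internal(),
+   * processing the whole frame in one call)
+   */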
+
+ for (y = dst_start; y < dst_height; y++) {
+ if(vOffsets[y] >= src_height) {
+ /* slice completed */
+ /*DBG("do_warp_yv12: max input height reached: need line %d, height %d\n -> Returning next output line: %d , start was %d\n",
+ (int)vOffsets[y], (int)src_height, (int)y, (int)dst_start);*/
+ return y;
+ }
+
+ vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] =
+ (256-vWeights[y]) << 16 | (256-vWeights[y]);
+ vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] =
+ vWeights[y] << 16 | vWeights[y];
+
+ srcp1 = srcp + vOffsets[y] * src_pitch;
+
+ if (Interlaced)
+ srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1;
+ else
+ srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1;
+
+ __asm__ __volatile__(
+ "movl "_src_row_size", %%"REGC" \n\t"
+ "shr $3, %%"REGC" \n\t" /* 8 bytes a time */
+ "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */
+ "mov "_srcp2", %%"REGD" \n\t" /* next " */
+ "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */
+ "xor %%"REGA", %%"REGA" \n\t"
+#if !defined(__x86_64__)
+ /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
+ * This first loop is not the performance bottleneck anyway but it is trivial to tune
+ * using SSE2 if we have proper alignment.
+ */
+ "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported? */
+ "jz vMaybeSSEMMX_12 \n\t" /* n, can't do anyway */
+#endif
+ "cmpl $2, %%"REGC" \n\t" /* we have at least 16 byts, 2 qwords? */
+ "jl vMaybeSSEMMX_12 \n\t" /* n, don't bother */
+
+ "mov %%"REGSI", %%"REGB" \n\t"
+ "or %%"REGD", %%"REGB" \n\t"
+ "test $15, %%"REGB" \n\t" /* both src rows 16 byte aligned? */
+ "jnz vMaybeSSEMMX_12 \n\t" /* n, don't use sse2 */
+
+ "shr $1, %%"REGC" \n\t" /* do 16 bytes at a time instead */
+ "dec %%"REGC" \n\t" /* jigger loop ct */
+
+ "movdqa "_FPround1", %%xmm0 \n\t"
+ "movdqa "_vWeight1", %%xmm5 \n\t"
+ "movdqa "_vWeight2", %%xmm6 \n\t"
+ "pxor %%xmm7, %%xmm7 \n"
+
+ ".align 16 \n"
+ "vLoopSSE2_Fetch_12: \n\t"
+#ifdef PREFETCH
+ " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
+ " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
+#endif
+ "vLoopSSE2_12: \n\t"
+      /* we've already checked that both pointers are dqword aligned */
+ " movdqa (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */
+ " movdqa (%%"REGD", %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */
+ " movdqa %%xmm1, %%xmm2 \n\t"
+ " movdqa %%xmm3, %%xmm4 \n\t"
+
+ " punpcklbw %%xmm7, %%xmm1 \n\t" /* make words */
+ " punpckhbw %%xmm7, %%xmm2 \n\t" /* " */
+ " punpcklbw %%xmm7, %%xmm3 \n\t" /* " */
+ " punpckhbw %%xmm7, %%xmm4 \n\t" /* " */
+
+ " pmullw %%xmm5, %%xmm1 \n\t" /* mult by top weighting factor */
+ " pmullw %%xmm5, %%xmm2 \n\t" /* " */
+ " pmullw %%xmm6, %%xmm3 \n\t" /* mult by bot weighting factor */
+ " pmullw %%xmm6, %%xmm4 \n\t" /* " */
+
+ " paddw %%xmm3, %%xmm1 \n\t" /* combine lumas low */
+ " paddw %%xmm4, %%xmm2 \n\t" /* combine lumas high */
+
+ " paddusw %%xmm0, %%xmm1 \n\t" /* round */
+ " paddusw %%xmm0, %%xmm2 \n\t" /* round */
+
+ " psrlw $8, %%xmm1 \n\t" /* right adjust luma */
+ " psrlw $8, %%xmm2 \n\t" /* right adjust luma */
+
+ " packuswb %%xmm2, %%xmm1 \n\t" /* pack words to our 16 byte answer */
+#ifdef STREAMING_STORE
+ " movntdq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+#else
+ " movdqu %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+#endif
+ " lea 16(%%"REGA"), %%"REGA" \n\t"
+ " decl %%"REGC" \n\t"
+
+ " jg vLoopSSE2_Fetch_12 \n\t" /* if not on last one loop, prefetch */
+ " jz vLoopSSE2_12 \n\t" /* or just loop, or not */
+
+ /* done with our SSE2 fortified loop but we may need to pick up the spare change */
+#ifdef STREAMING_STORE
+ " sfence \n\t"
+#endif
+ " movl "_src_row_size", %%"REGC" \n\t" /* get count again */
+ " andl $15, %%"REGC" \n\t" /* just need mod 16 */
+ " movq "_vWeight1", %%mm5 \n\t"
+ " movq "_vWeight2", %%mm6 \n\t"
+ " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
+
+ " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
+ " jz MoreSpareChange_12 \n" /* n, did them all */
+
+ /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
+ * This first loop is not the performance bottleneck anyway but it is trivial to tune
+ * using SSE if we have proper alignment.
+ */
+ "vMaybeSSEMMX_12: \n\t"
+
+ " movq "_vWeight1", %%mm5 \n\t"
+ " movq "_vWeight2", %%mm6 \n\t"
+ " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
+ " pxor %%mm7, %%mm7 \n\t"
+#if !defined(__x86_64__)
+ " testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXTsupported? */
+ " jz vLoopMMX_12 \n\t" /* n, can't do anyway */
+#endif
+ " decl %%"REGC" \n" /* jigger loop ctr */
+
+ ".align 16 \n"
+ "vLoopSSEMMX_Fetch_12: \n\t"
+#ifdef PREFETCH
+ " prefetcht0 8(%%"REGSI", %%"REGA") \n\t"
+ " prefetcht0 8(%%"REGD", %%"REGA") \n"
+#endif
+ "vLoopSSEMMX_12: \n\t"
+
+ " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
+ " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
+
+ " movq %%mm1, %%mm2 \n\t"
+ " movq %%mm3, %%mm4 \n\t"
+
+ " punpcklbw %%mm7, %%mm1 \n\t" /* make words */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* " */
+ " punpcklbw %%mm7, %%mm3 \n\t" /* " */
+ " punpckhbw %%mm7, %%mm4 \n\t" /* " */
+
+ " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */
+ " pmullw %%mm5, %%mm2 \n\t" /* " */
+ " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */
+ " pmullw %%mm6, %%mm4 \n\t" /* " */
+
+ " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */
+ " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */
+
+ " paddusw %%mm0, %%mm1 \n\t" /* round */
+ " paddusw %%mm0, %%mm2 \n\t" /* round */
+
+ " psrlw $8, %%mm1 \n\t" /* right adjust luma */
+ " psrlw $8, %%mm2 \n\t" /* right adjust luma */
+
+ " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 16 byte answer */
+#ifdef STREAMING_STORE
+ " movntq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+#else
+ " movq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+#endif
+ " lea 8(%%"REGA"), %%"REGA" \n\t"
+ " decl %%"REGC" \n\t"
+
+ " jg vLoopSSEMMX_Fetch_12 \n\t" /* if not on last one loop, prefetch */
+ " jz vLoopSSEMMX_12 \n\t" /* or just loop, or not */
+#ifdef STREAMING_STORE
+ " sfence \n\t"
+#endif
+ " jmp MoreSpareChange_12 \n" /* all done with vertical */
+
+ ".align 16 \n"
+ "vLoopMMX_12: \n\t"
+
+ " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
+ " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
+
+ " movq %%mm1, %%mm2 \n\t"
+ " movq %%mm3, %%mm4 \n\t"
+
+ " punpcklbw %%mm7, %%mm1 \n\t" /* make words */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* " */
+ " punpcklbw %%mm7, %%mm3 \n\t" /* " */
+ " punpckhbw %%mm7, %%mm4 \n\t" /* " */
+
+ " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */
+ " pmullw %%mm5, %%mm2 \n\t" /* " */
+ " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */
+ " pmullw %%mm6, %%mm4 \n\t" /* " */
+
+ " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */
+ " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */
+
+ " paddusw %%mm0, %%mm1 \n\t" /* round */
+ " paddusw %%mm0, %%mm2 \n\t" /* round */
+
+ " psrlw $8, %%mm1 \n\t" /* right adjust luma */
+ " psrlw $8, %%mm2 \n\t" /* right adjust luma */
+
+ " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 16 byte answer */
+ " movq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+
+ " lea 8(%%"REGA"), %%"REGA" \n\t"
+ " loop vLoopMMX_12 \n"
+
+     /* Add a little code here to check if we have more pixels to do and, if so, make one
+      * more pass thru vLoopMMX. We were processing in multiples of 8 pixels and always have
+      * an even number so there will never be more than 7 left.
+      */
+ "MoreSpareChange_12: \n\t"
+
+ " cmpl "_src_row_size", %%"REGEA" \n\t" /* did we get them all */
+ " jnl DoHorizontal_12 \n\t" /* yes, else have 2 left */
+ " movl $1, %%"REGC" \n\t" /* jigger loop ct */
+ " movl "_src_row_size", %%"REGEA" \n\t"
+ " sub $8, %%"REGA" \n\t" /* back up to last 8 pixels */
+ " jmp vLoopMMX_12 \n"
+
+ /* We've taken care of the vertical scaling, now do horizontal */
+ "DoHorizontal_12: \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
+ " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
+ " movl "_row_size", %%"REGC" \n\t"
+ " shrl $2, %%"REGC" \n\t" /* 4 bytes a time, 4 pixels */
+ " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */
+ " mov "_dstp", %%"REGDI" \n\t" /* the destination line */
+#if !defined(__x86_64__)
+ " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
+ " jz hLoopMMX_12 \n\t" /* n, can't do anyway */
+#endif
+ /* With SSE support we will make 8 pixels (from 8 pairs) at a time */
+ " shrl $1, %%"REGC" \n\t" /* 8 bytes a time instead of 4 */
+ " jz LessThan8 \n"
+
+ ".align 16 \n"
+ "hLoopMMXSSE_12: \n\t"
+
+
+ /* handle first 2 pixels */
+ /* phi: must use movl here (x86_64, reading from table of uint_32's) */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
+
+ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */
+ " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */
+ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \n\t" /* round */
+ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* handle 3rd and 4th pixel pairs */
+ " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " movl 16+48(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 5st pixel pair */
+ " movl 20+48(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 6nd pixel pair */
+ " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm1 \n\t" /* round */
+ " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* handle 5th and 6th pixel pairs */
+ " movd (%%"REGD", %%"REGA"), %%mm2 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm2 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm2 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " movl 16+72(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 7st pixel pair */
+ " movl 20+72(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 8nd pixel pair */
+ " pmaddwd 48(%%"REGSI"), %%mm2 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm2 \n\t" /* round */
+ " psrlw $8, %%mm2 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* handle 7th and 8th pixel pairs */
+ " movd (%%"REGD", %%"REGA"), %%mm3 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm3 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm3 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " pmaddwd 72(%%"REGSI"), %%mm3 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm3 \n\t" /* round */
+ " psrlw $8, %%mm3 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* combine, store, and loop */
+ " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */
+ " packuswb %%mm3, %%mm2 \n\t" /* pack into qword, 0Y0Y0Y0Y */
+ " packuswb %%mm2, %%mm0 \n\t" /* and again into YYYYYYYY */
+#ifdef STREAMING_STORE
+      " movntq %%mm0, (%%"REGDI")         \n\t" /* done with 8 pixels */
+#else
+      " movq %%mm0, (%%"REGDI")           \n\t" /* done with 8 pixels */
+#endif
+
+ " lea 96(%%"REGSI"), %%"REGSI" \n\t"
+ " lea 8(%%"REGDI"), %%"REGDI" \n\t"
+ " decl %%"REGC" \n\t"
+ " jg hLoopMMXSSE_12 \n\t" /* loop for more */
+#ifdef STREAMING_STORE
+ " sfence \n"
+#endif
+ "LessThan8: \n\t"
+ " movl "_row_size", %%"REGC" \n\t"
+ " andl $7, %%"REGC" \n\t" /* we have done all but maybe this */
+ " shrl $2, %%"REGC" \n\t" /* now do only 4 bytes at a time */
+ " jz LessThan4 \n"
+
+ ".align 16 \n"
+ "hLoopMMX_12: \n\t"
+
+ /* handle first 2 pixels */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
+ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */
+ " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */
+ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \n\t" /* round */
+ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* handle 3rd and 4th pixel pairs */
+ " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */
+ " punpckldq (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm1 \n\t" /* round */
+ " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* combine, store, and loop */
+ " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */
+ " packuswb %%mm7, %%mm0 \n\t" /* and again into 0000YYYY */
+ " movd %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */
+ " lea 48(%%"REGSI"), %%"REGSI" \n\t"
+ " lea 4(%%"REGDI"), %%"REGDI" \n\t"
+
+ " loop hLoopMMX_12 \n" /* loop for more */
+
+ /* test to see if we have a mod 4 size row, if not then more spare change */
+ "LessThan4: \n\t"
+ " movl "_row_size", %%"REGC" \n\t"
+ " andl $3, %%"REGC" \n\t" /* remainder side mod 4 */
+ " cmpl $2, %%"REGC" \n\t"
+ " jl LastOne \n\t" /* none, none */
+
+ /* handle 2 more pixels */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
+ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+
+ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \n\t" /* round */
+ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+ " packuswb %%mm7, %%mm0 \n\t" /* pack into qword, 00000Y0Y */
+ " packuswb %%mm7, %%mm0 \n\t" /* and again into 000000YY */
+ " movd %%mm0, (%%"REGDI") \n\t" /* store, we are guarrenteed room in buffer (8 byte mult) */
+ " subl $2, %%"REGC" \n\t"
+
+ " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes */
+ " lea 2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */
+
+ /* maybe one last pixel */
+ "LastOne: \n\t"
+ " cmpl $0, %%"REGC" \r\n" /* still more ? */
+ " jz AllDone \r\n" /* n, done */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+
+ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \n\t" /* round */
+ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+ " movd %%mm0, %%"REGEA" \n\t"
+ " movb %%al, (%%"REGDI") \n" /* store last one */
+
+ "AllDone: \n\t"
+ " emms \n\t"
+#if !defined(__x86_64__)
+ "mov "_oldbx", %%"REGB" \n\t"
+#endif
+ ::
+ "m" /*0*/(FPround1),
+ "m" /*1*/(vWeight1),
+ "m" /*2*/(vWeight2),
+ "m" /*3*/(y/*YMask[0]*/),
+ "m" /*4*/(src_row_size),
+ "m" /*5*/(y/*EndOffset*/),
+ "m" /*6*/(pControl),
+ "m" /*7*/(row_size),
+ "m" /*8*/(vWorkYW),
+ "m" /*9*/(dstp),
+ "m" /*10*/(y/*vWorkUVW*/),
+ "m" /*11*/(FPround2),
+ "m" /*12*/(srcp1),
+ "m" /*13*/(srcp2)
+#if !defined(__x86_64__)
+ ,
+ "m" /*14*/(oldbx),
+ "m" /*15*/(SSEMMXenabledW),
+ "m" /*16*/(SSE2enabledW)
+ : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI
+#else
+ : REGA, REGB, REGC, REGD, REGSI, REGDI
+#endif
+ );
+
+ dstp += dst_pitch;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * tools
+ */
+
+#ifndef ALIGN
+# define ALIGN(b,p) ((void*)((((unsigned long)(p)) + (b)-1) & (~((b)-1))))
+#endif
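+/* e.g. ALIGN(128, p) rounds pointer p up to the next 128 byte boundary
+ * (unchanged if already aligned) */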
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+# define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+#ifndef FABS
+# define FABS(x) ((x) < 0.0 ? -(x) : (x))
+#endif
+
+/*
+ * xine plugin
+ */
+
+#define PLUGIN_ID "warp"
+#define PLUGIN_DESCR "(non-)linear software scaling post plugin"
+#define PLUGIN_T warp_plugin_t
+/*#define POST_THREADS*/
+/*#define POST_SLICES*/
+#include "xine/post_util.h"
+
+
+/* plugin class initialization function */
+void *warp_init_plugin(xine_t *xine, void *);
+
+/* plugin class functions */
+static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs,
+ xine_audio_port_t **audio_target,
+ xine_video_port_t **video_target);
+
+/* plugin instance functions */
+static void warp_dispose(post_plugin_t *this_gen);
+
+/* vo_frame functions */
+static vo_frame_t *got_frame(vo_frame_t *frame);
+static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame);
+
+
+/* parameter functions */
+static xine_post_api_descr_t *warp_get_param_descr(void);
+static int warp_set_parameters(xine_post_t *this_gen, void *param_gen);
+static int warp_get_parameters(xine_post_t *this_gen, void *param_gen);
+static char *warp_get_help(void);
+
+
+typedef struct warp_parameters_s {
+ int output_width;
+ int output_height;
+ double output_aspect;
+ int no_downscaling;
+} warp_parameters_t;
+
+START_PARAM_DESCR(warp_parameters_t)
+PARAM_ITEM(POST_PARAM_TYPE_INT, output_width, NULL, 640, 1920, 0,
+ "output video width")
+PARAM_ITEM(POST_PARAM_TYPE_INT, output_height, NULL, 480, 1080, 0,
+ "output video height")
+PARAM_ITEM(POST_PARAM_TYPE_DOUBLE, output_aspect, NULL, 1, 3, 0,
+ "output video aspect ratio")
+PARAM_ITEM(POST_PARAM_TYPE_BOOL, no_downscaling,NULL, 0, 1, 0,
+ "disable downscaling")
+END_PARAM_DESCR(warp_param_descr)
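+
+/* How a frontend could adjust these at runtime - a sketch using the generic
+ * xine post parameter API (not part of this file):
+ *
+ *   xine_post_in_t    *in  = xine_post_input(post, "parameters");
+ *   xine_post_api_t   *api = (xine_post_api_t *)in->data;
+ *   warp_parameters_t  p;
+ *   api->get_parameters(post, &p);
+ *   p.output_width  = 1280;
+ *   p.output_height = 720;
+ *   api->set_parameters(post, &p);
+ */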
+
+
+typedef struct {
+ post_plugin_t post;
+
+ xine_post_in_t parameter_input;
+
+ /* User config (changes to actual config are delayed) */
+ warp_parameters_t config;
+
+ /* Current config */
+ int enable;
+ int output_width;
+ int output_height;
+ double output_aspect;
+ double factor_x;
+ double factor_y;
+
+ /* Last seen input frame */
+ int input_width;
+ int input_height;
+ int input_format;
+ int input_interlaced;
+ double input_aspect;
+
+ /* working buffers */
+ uint32_t *vWorkY;
+ uint32_t *vWorkUV;
+
+ /* scaling tables */
+ uint32_t *hControl;
+ uint32_t *hControlUV;
+ uint32_t *vOffsets;
+ uint32_t *vOffsetsUV;
+ uint32_t *vWeights;
+ uint32_t *vWeightsUV;
+
+ /* memory for work areas and scaling tables */
+ void *pMem;
+
+} warp_plugin_t;
+
+/*
+ *
+ */
+
+static void init_tables(warp_plugin_t *this)
+{
+#define BP(x) ((uint8_t*)(x))
+ /* allocate memory for scaling tables and workspace */
+ free(this->pMem);
+ this->pMem = xine_xmalloc(this->input_width*3 + this->output_width*sizeof(uint32_t)*3*2 +
+ this->output_height*sizeof(uint32_t)*4 + 2*9*128);
+
+ /* - aligned for P4 cache line */
+ this->vWorkY = (uint32_t*)ALIGN(128, this->pMem);
+ this->vWorkUV = (uint32_t*)ALIGN(128, BP(this->vWorkY) + this->input_width*2 + 128);
+ this->hControl = (uint32_t*)ALIGN(128, BP(this->vWorkUV) + this->input_width + 128);
+ this->vOffsets = (uint32_t*)ALIGN(128, BP(this->hControl) + this->output_width * sizeof(uint32_t) * 3 + 128);
+ this->vWeights = (uint32_t*)ALIGN(128, BP(this->vOffsets) + this->output_height * sizeof(uint32_t) + 128);
+
+ if (this->input_format == XINE_IMGFMT_YV12) {
+ this->vOffsetsUV = (uint32_t*)ALIGN(128, BP(this->vWeights) + this->output_height * sizeof(uint32_t) + 128);
+ this->vWeightsUV = (uint32_t*)ALIGN(128, BP(this->vOffsetsUV) + this->output_height * sizeof(uint32_t) + 128);
+ this->hControlUV = (uint32_t*)ALIGN(128, BP(this->vWeightsUV) + this->output_height * sizeof(uint32_t) + 128);
+
+ init_tables_yv12(this->output_width, this->output_height,
+ this->input_width, this->input_height,
+ this->input_interlaced, this->factor_x, this->factor_y,
+ this->hControl, this->vOffsets, this->vWeights,
+ this->hControlUV, this->vOffsetsUV, this->vWeightsUV );
+
+ } else if (this->input_format == XINE_IMGFMT_YUY2) {
+
+ init_tables_yuy2(this->output_width, this->output_height,
+ this->input_width, this->input_height,
+ this->input_interlaced, this->factor_x, this->factor_y,
+ this->hControl, this->vOffsets, this->vWeights );
+ }
+}
+
+static void calculate_factors(warp_plugin_t *this)
+{
+ /* try to guess amount to stretch/shrink */
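+  /* Worked example (illustrative numbers): a 16:9 input (ratio 1.778) with a
+   * 4:3 target (1.333) gives adiff = 0.444 and takes the "16:9 ... 12:9 ->
+   * 4:3" branch below:
+   *   factor_x = 1.0 - 0.05 * 0.444 * 9.0/4.0 = 0.95
+   *   factor_y = 1.0 + 0.15 * 0.444 * 9.0/4.0 = 1.15
+   */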
+ double adiff = this->input_aspect - this->output_aspect;
+ this->factor_x = 1.0;
+ this->factor_y = 1.0;
+
+ if (adiff > 0.1) {
+
+ if (adiff > 0.1 + ((16.0-12.0)/9.0)) {
+ /* >16:9 -> >4:3 */
+ DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff);
+ this->factor_x = 0.95;
+ this->factor_y = 1.15;
+ this->output_aspect += (adiff - 4.0/9.0);
+ DBG(" changing target ratio to %3.1lf\n", this->output_aspect);
+ } else {
+ /* 16:9 ... 12:9 -> 4:3 */
+ DBG("aspect ratio diff %1.3lf > 0 : 16.9...12:9 -> 4:3\n", adiff);
+ this->factor_x = 1.0 - 0.05 * adiff * 9.0/4.0;
+ this->factor_y = 1.0 + 0.15 * adiff * 9.0/4.0;
+ }
+
+ } else if (adiff < -0.1) {
+
+ if(adiff < -0.1-((16.0-12.0)/9.0)) {
+ /* <4:3 -> <16:9 */
+ DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff);
+ this->factor_x = 1.05;
+ this->factor_y = 0.85;
+ this->output_aspect += (adiff + 4.0/9.0);
+ DBG(" changing target ratio to %3.1lf\n", this->output_aspect);
+ } else {
+ /* 4:3...16:9 -> 16:9 */
+ DBG("aspect ratio diff %1.3lf < 0 : 4:3...16:9 -> 16:9\n", adiff);
+ this->factor_x = 1.0 + 0.05 * adiff * 9.0/4.0;
+ this->factor_y = 1.0 - 0.15 * adiff * 9.0/4.0;
+ }
+
+ } else {
+ DBG("aspect ratio matches, no warp\n");
+ this->factor_x = 1.0;
+ this->factor_y = 1.0;
+ }
+
+ DBG("factor_x = %1.3lf factor_y = %1.3lf output ratio = %1.3lf\n",
+ this->factor_x, this->factor_y, this->output_aspect);
+}
+
+/*
+ *
+ */
+
+void *warp_init_plugin(xine_t *xine, void *data)
+{
+#if !defined(__x86_64__)
+ /* Need at least MMX */
+ if (!(xine_mm_accel() & MM_ACCEL_X86_MMX)) {
+ fprintf(stderr, "warp_init_plugin: ERROR: at least MMX required\n");
+ return NULL;
+ }
+#endif
+
+ return init_plugin(xine, data);
+}
+
+static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs,
+ xine_audio_port_t **audio_target,
+ xine_video_port_t **video_target)
+{
+ warp_plugin_t *this = (warp_plugin_t *) xine_xmalloc(sizeof(warp_plugin_t));
+ post_plugin_t *this_gen = (post_plugin_t *) this;
+ post_in_t *input;
+ post_out_t *output;
+ xine_post_in_t *input_param;
+ post_video_port_t *port;
+
+ static xine_post_api_t post_api =
+ { warp_set_parameters, warp_get_parameters, warp_get_param_descr, warp_get_help };
+
+ if (!this || !video_target || !video_target[0]) {
+ free(this);
+ return NULL;
+ }
+
+ _x_post_init(this_gen, 0, 1);
+
+ port = _x_post_intercept_video_port(this_gen, video_target[0], &input, &output);
+ port->intercept_frame = intercept_frame_yuy;
+ port->new_frame->draw = post_draw;
+ input->xine_in.name = "video";
+ output->xine_out.name = "video (scaled)";
+ this_gen->xine_post.video_input[0] = &port->new_port;
+
+ this_gen->dispose = warp_dispose;
+
+ input_param = &this->parameter_input;
+ input_param->name = "parameters";
+ input_param->type = XINE_POST_DATA_PARAMETERS;
+ input_param->data = &post_api;
+ xine_list_push_back(this_gen->input, input_param);
+
+ this->config.output_aspect = 0.0; /* -> do not change aspect ratio */
+ this->config.output_width = 0; /* -> do not change width */
+ this->config.output_height = 0; /* -> do not change height */
+ this->config.no_downscaling = 0;
+
+ this->input_width = 0; /* not known yet, triggers initialization later */
+ this->input_height = 0;
+
+ return this_gen;
+}
+
+static void warp_dispose(post_plugin_t *this_gen)
+{
+ if (_x_post_dispose(this_gen)) {
+ warp_plugin_t *this = (warp_plugin_t *) this_gen;
+
+ DBG("dispose\n");
+
+ free(this->pMem);
+ free(this);
+ }
+}
+
+static vo_frame_t *got_frame(vo_frame_t *frame)
+{
+ post_video_port_t *port = (post_video_port_t *)frame->port;
+ warp_plugin_t *this = (warp_plugin_t *)port->post;
+ double adiff = this->input_aspect - frame->ratio;
+
+ if (this->input_width != frame->width || this->input_height != frame->height ||
+ this->input_format != frame->format || FABS(adiff)>0.1 ||
+ this->input_interlaced != !!(frame->flags & VO_INTERLACED_FLAG)) {
+
+ DBG("detected frame format change: %dx%d -> %dx%d, interlaced %d->%d, aspect %1.3lf->%1.3lf, %s->%s\n",
+ this->input_width, this->input_height, frame->width, frame->height,
+ this->input_interlaced, !!(frame->flags & VO_INTERLACED_FLAG),
+ this->input_aspect, frame->ratio,
+ this->input_format==XINE_IMGFMT_YV12 ? "yv12":"yuy2",
+ frame->format==XINE_IMGFMT_YV12 ? "yv12":"yuy2" );
+
+ /* free tables and buffers */
+ free(this->pMem);
+ this->pMem = NULL;
+
+ /* remember frame properties to detect changes in video format */
+ this->input_width = frame->width;
+ this->input_height = frame->height;
+ this->input_format = frame->format;
+ this->input_aspect = frame->ratio;
+ this->input_interlaced = !!(frame->flags & VO_INTERLACED_FLAG);
+
+ /* re-configure target size and aspect ratio */
+ this->output_aspect = this->config.output_aspect ?: frame->ratio;
+ if (!this->config.no_downscaling) {
+ this->output_width = this->config.output_width ?: frame->width;
+ this->output_height = this->config.output_height ?: frame->height;
+ } else {
+ this->output_width = MAX(this->config.output_width, frame->width);
+ this->output_height = MAX(this->config.output_height, frame->height);
+ }
+
+ /* calculate warp function factors */
+ calculate_factors(this);
+
+ if(this->output_width == frame->width &&
+ this->output_height == frame->height &&
+ adiff < 0.1 &&
+ adiff > -0.1 ) {
+ this->enable = 0;
+ DBG("--> nothing to do, disabling processing for now");
+ return NULL;
+ }
+
+ this->enable = 1;
+
+ init_tables(this);
+ }
+
+ if (!this->enable)
+ return NULL;
+
+ return port->original_port->get_frame(port->original_port,
+ this->output_width, this->output_height,
+ this->output_aspect, frame->format,
+ frame->flags | VO_BOTH_FIELDS);
+}
+
+static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame)
+{
+ post_video_port_t *port = (post_video_port_t *)frame->port;
+ warp_plugin_t *this = (warp_plugin_t *)port->post;
+ int proc_height = frame->height;
+
+ if (frame->format == XINE_IMGFMT_YV12) {
+
+ do_warp_yv12(new_frame->base[0], frame->base[0],
+ new_frame->pitches[0], frame->pitches[0],
+ this->output_width, this->output_height,
+ frame->width, proc_height,
+ this->input_interlaced,
+ this->hControl, this->vOffsets, this->vWeights,
+ this->vWorkY,
+ 0);
+ proc_height /= 2;
+ do_warp_yv12(new_frame->base[1], frame->base[1],
+ new_frame->pitches[1], frame->pitches[1],
+ this->output_width/2, this->output_height/2,
+ frame->width/2, proc_height,
+ this->input_interlaced,
+ this->hControlUV, this->vOffsetsUV, this->vWeightsUV,
+ this->vWorkUV,
+ 0);
+ do_warp_yv12(new_frame->base[2], frame->base[2],
+ new_frame->pitches[2], frame->pitches[2],
+ this->output_width/2, this->output_height/2,
+ frame->width/2, proc_height,
+ this->input_interlaced,
+ this->hControlUV, this->vOffsetsUV, this->vWeightsUV,
+ this->vWorkUV,
+ 0);
+
+ } else if (frame->format == XINE_IMGFMT_YUY2) {
+ do_warp_yuy2(new_frame->base[0], frame->base[0],
+ new_frame->pitches[0], frame->pitches[0],
+ this->output_width, this->output_height,
+ frame->width, proc_height,
+ this->input_interlaced,
+ this->hControl, this->vOffsets, this->vWeights,
+ this->vWorkY, this->vWorkUV,
+ 0);
+ }
+}
+
+/*
+ * parameter functions
+ */
+
+static xine_post_api_descr_t *warp_get_param_descr(void)
+{
+ return &warp_param_descr;
+}
+
+static int warp_set_parameters(xine_post_t *this_gen, void *param_gen)
+{
+ warp_plugin_t *this = (warp_plugin_t *)this_gen;
+ warp_parameters_t *params = (warp_parameters_t *)param_gen;
+
+ memcpy(&this->config, params, sizeof(warp_parameters_t));
+ this->input_width = this->input_height = 0;
+
+ DBG("warp_set_parameters: "
+ "output_width=%d, output_height=%d, output_aspect=%4.3lf, no_downscaling=%d\n",
+ this->config.output_width, this->config.output_height, this->config.output_aspect,
+ this->config.no_downscaling);
+
+ return 1;
+}
+
+static int warp_get_parameters(xine_post_t *this_gen, void *param_gen)
+{
+ warp_plugin_t *this = (warp_plugin_t *)this_gen;
+ warp_parameters_t *params = (warp_parameters_t *)param_gen;
+
+ DBG("warp_get_parameters\n");
+ memcpy(params, &this->config, sizeof(warp_parameters_t));
+
+ return 1;
+}
+
+static char *warp_get_help(void) {
+ return _(
+ "The warp plugin scales video to another resolution. "
+ "It supports non-linear stretching to change video aspect ratio. "
+ "\n"
+ "Parameters\n"
+ " output_width: Scale video to width\n"
+ " (0 -> do not change video width)\n"
+ " output_height: Scale video to height\n"
+ " (0 -> do not change video height)\n"
+ " output_aspect: Adjust aspect ratio using non-linear scaling\n"
+ " (0 -> do not change video aspect ratio)\n"
+ " no_downscaling: Do not downscale video\n"
+ "\n"
+ );
+}
+
+
+/*
+ * plugin info
+ */
+
+static post_info_t info = { XINE_POST_TYPE_VIDEO_FILTER };
+
+const plugin_info_t xine_plugin_info[] __attribute__((visibility("default"))) =
+{
+ /* type, API, "name", version, special_info, init_function */
+ { PLUGIN_POST, 9, "warp", XINE_VERSION_CODE, &info, &warp_init_plugin },
+ { PLUGIN_POST, 9, "swscale", XINE_VERSION_CODE, &info, &warp_init_plugin },
+ { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};