-rw-r--r--  xine_post_swscale.c  1703
1 files changed, 1703 insertions, 0 deletions
diff --git a/xine_post_swscale.c b/xine_post_swscale.c
new file mode 100644
index 00000000..bea7ce0a
--- /dev/null
+++ b/xine_post_swscale.c
@@ -0,0 +1,1703 @@
+/*
+ * Copyright (C) 2000-2007 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * $Id: xine_post_swscale.c,v 1.1 2008-02-20 22:31:23 phintuka Exp $
+ *
+ * Simple (faster) resize for avisynth
+ * Copyright (C) 2002 Tom Barry
+ *
+ * Very simple 2 tap linear interpolation.
+ * It is unfiltered which means it will not soften much.
+ *
+ * WarpedResize will do a non-linear stretch/squeeze in both the horizontal
+ * and vertical dimensions. This can be useful when you want to change the
+ * aspect ratio of a video clip and have it mostly distorted at the
+ * top, bottom, and side edges.
+ *
+ *
+ * Ported to linux/xine by Petri Hintukainen <phintuka@users.sourceforge.net>
+ * - Added x86_64 support
+ * - Added PIC support (do not clobber ebx in x86, access only local variables from asm)
+ * - Fixed yv12 stretched warp tables generation
+ */
+
+#include <xine/xine_internal.h>
+#include <xine/post.h>
+
+/*#define DBG(x...)*/
+#define DBG(x...) fprintf(stderr, "post_warp: " x)
+
+#define STREAMING_STORE
+#define PREFETCH
+/*#define VANILLA*/
+
+/*
+ * This function accepts a position from 0 to 1 and warps it, to 0 through 1 based
+ * upon the wFact var. The warp equations are designed to:
+ *
+ * * Always be rising but yield results from 0 to 1
+ *
+ * * Have a first derivative that doesn't go to 0 or infinity, at least close
+ * to the center of the screen
+ *
+ * * Have a curvature (absolute val of 2nd derivative) that is small in the
+ * center and smoothly rises towards the edges. We would like the curvature
+ * to be everywhere = 0 when the warp factor = 1
+ */
+static double WarpFactor(double position, double wFact)
+{
+ double x;
+ double z;
+ double w;
+ x = 2 * (position - .5);
+  /* The warp is calculated as z = (1 - w) * x^3 + w * x, centered
+   * around .5 and ranging from 0 to 1. After some tinkering this seems
+   * to give decent values and derivatives at the right places.
+   */
+  w = 2.0 - wFact; /* reverse parm for compat with initial release */
+
+ if (x < 0.0) {
+ z = -(1 - w) * x*x*x - w * x; /* -1 < x < 0, wFact < 1 */
+ return .5 - .5 * z;
+ } else {
+    z = (1 - w) * x*x*x + w * x; /* 0 <= x <= 1, wFact < 1 */
+ return .5 + .5 * z; /* amts to same formula as above for now */
+ }
+}
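+
+/* Worked example (illustrative values): with wFact = 0.8 we get w = 1.2, so
+ * position = 0.75 gives x = 0.5 and z = (1 - 1.2) * 0.125 + 1.2 * 0.5 = 0.575,
+ * i.e. a returned position of .5 + .5 * 0.575 = 0.7875 - slightly past the
+ * linear 0.75, so the mapping samples further into the source near the edges.
+ */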
+
+/*
+ * YV12
+ *
+ * For each horizontal output pair of pixels there are 2 qword masks followed by 2 int
+ * offsets. The 2 masks are the weights to be used for the luma and chroma, respectively.
+ * Each mask contains LeftWeight1, RightWeight1, LeftWeight2, RightWeight2. So a pair of pixels
+ * will be processed on each pass through the horizontal resize loop. I think with my
+ * current math the horizontal luma and chroma contain the same values, but since I may
+ * have screwed it up I'll leave it this way for now. Vertical chroma is different.
+ *
+ * Note - try just using the luma calcs for both; they seem to be the same.
+ *
+ * The weights are scaled 0-256 and the left and right weights will sum to 256 for each pixel.
+ */
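+
+/* Layout of one output pixel pair in hControl - six uint32_t's, summarized
+ * from the indexing used below (entries +2/+3 are only written for YUY2):
+ *   hControl[i*3+0]  even pixel luma weights (wY2 << 16 | wY1)
+ *   hControl[i*3+1]  odd  pixel luma weights
+ *   hControl[i*3+2]  even pixel chroma weights (YUY2)
+ *   hControl[i*3+3]  odd  pixel chroma weights (YUY2)
+ *   hControl[i*3+4]  even pixel source offset
+ *   hControl[i*3+5]  odd  pixel source offset
+ */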
+static void init_tables_yv12(int newwidth, int newheight, int oldwidth, int oldheight,
+ int Interlaced, double hWarp, double vWarp,
+ uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights,
+ uint32_t *hControlUV, uint32_t *vOffsetsUV, uint32_t *vWeightsUV)
+{
+ int i;
+ int j;
+ int k;
+ int wY1;
+ int wY2;
+ DBG("init_yv12: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n",
+ oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);
+
+  /* First set up the horizontal table; use it for both luma & chroma since
+   * they seem to have the same equation.
+   * We will generate these values in pairs, mostly because that's the way
+   * I wrote it for YUY2.
+   */
+
+ for(i=0; i < newwidth; i+=2) {
+ /* first make even pixel control */
+    if (hWarp==1) /* if no warp factor */
+ j = i * 256 * (oldwidth-1) / (newwidth-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1));
+
+ k = j>>8;
+ wY2 = j - (k << 8); /* luma weight of right pixel */
+ wY1 = 256 - wY2; /* luma weight of left pixel */
+
+ if (k > oldwidth - 2) {
+ hControl[i*3+4] = oldwidth - 1; /* point to last byte */
+ hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */
+ } else {
+ hControl[i*3+4] = k; /* pixel offset */
+ hControl[i*3] = wY2 << 16 | wY1; /* luma weights */
+ }
+
+ /* now make odd pixel control */
+ if (hWarp==1) /* if no warp factor */
+ j = (i+1) * 256 * (oldwidth-1) / (newwidth-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1));
+
+ k = j>>8;
+ wY2 = j - (k << 8); /* luma weight of right pixel */
+ wY1 = 256 - wY2; /* luma weight of left pixel */
+
+ if (k > oldwidth - 2) {
+ hControl[i*3+5] = oldwidth - 1; /* point to last byte */
+ hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */
+ } else {
+ hControl[i*3+5] = k; /* pixel offset */
+ hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */
+ }
+ }
+
+ hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */
+ hControl[newwidth*3+5] = 2 * (oldwidth-1); /* " */
+#ifndef VANILLA
+  /* UV */
+ for(i=0; i < newwidth/2; i+=2) {
+ /* first make even pixel control */
+    if (hWarp==1) /* if no warp factor */
+ j = i * 256 * (oldwidth/2-1) / (newwidth/2-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newwidth/2-1.0), hWarp) * (oldwidth/2-1));
+
+ k = j>>8;
+    wY2 = j - (k << 8); /* chroma weight of right pixel */
+    wY1 = 256 - wY2; /* chroma weight of left pixel */
+
+    if (k > oldwidth/2 - 2) {
+      hControlUV[i*3+4] = oldwidth/2 - 1; /* point to last byte */
+      hControlUV[i*3] = 0x00000100; /* use 100% of rightmost chroma */
+    } else {
+      hControlUV[i*3+4] = k; /* pixel offset */
+      hControlUV[i*3] = wY2 << 16 | wY1; /* chroma weights */
+    }
+
+ /* now make odd pixel control */
+ if (hWarp==1) /* if no warp factor */
+ j = (i+1) * 256 * (oldwidth/2-1) / (newwidth/2-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor((i+1) / (newwidth/2-1.0), hWarp) * (oldwidth/2-1));
+
+ k = j>>8;
+    wY2 = j - (k << 8); /* chroma weight of right pixel */
+    wY1 = 256 - wY2; /* chroma weight of left pixel */
+
+    if (k > oldwidth/2 - 2) {
+      hControlUV[i*3+5] = oldwidth/2 - 1; /* point to last byte */
+      hControlUV[i*3+1] = 0x00000100; /* use 100% of rightmost chroma */
+    } else {
+      hControlUV[i*3+5] = k; /* pixel offset */
+      hControlUV[i*3+1] = wY2 << 16 | wY1; /* chroma weights */
+    }
+ }
+
+ hControlUV[newwidth/2*3+4] = (oldwidth/2-1); /* give it something to prefetch at end */
+ hControlUV[newwidth/2*3+5] = (oldwidth/2-1); /* " */
+#endif
+
+  /* Next set up the vertical tables. The offsets are measured in lines and
+   * will be multiplied by the source pitch later. */
+
+ /* For YV12 we need separate Luma and chroma tables */
+
+ /* First Luma Table */
+ for(i=0; i< newheight; ++i) {
+ if (vWarp==1) /* if no warp factor */
+ j = i * 256 * (oldheight-1) / (newheight-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1));
+
+ if (Interlaced) { /* do hard way? */
+ if (i%2) { /* is odd output line? */
+ if (j < 256) { /* before 1st odd input line */
+ vOffsets[i] = 1; /* all from line 1 */
+ vWeights[i] = 0; /* weight to give to 2nd line */
+ } else {
+ k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* is even output line */
+ k = (j >> 9) << 1; /* next lower even line */
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* simple way, do as progressive */
+ k = j >> 8;
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2; /* weight to give to 2nd line */
+ }
+ }
+
+ /* Vertical table for chroma */
+ for(i=0; i< newheight/2; ++i) {
+ if (vWarp==1) /* if no warp factor */
+#ifdef VANILLA
+ j = (int) ( (i+.25) * 256 * (oldheight-1) / (newheight-1.0) - 64 );
+#else
+ j = (int) ( (i+.25) * 256 * (oldheight/2-1) / (newheight/2-1.0) - 64 );
+#endif
+ else /* stretch and warp somehow */
+#ifdef VANILLA
+ j = (int) (256 * WarpFactor( (i+.25) / (newheight-1.0), vWarp) * (oldheight-1.0) );
+#else
+ j = (int) (256 * WarpFactor( (i+.25) / (newheight/2 - 1.0), vWarp) * (oldheight/2 - 1.0) );
+#endif
+#ifndef VANILLA
+ if(j<0) j=0;
+#endif
+ if (Interlaced) { /* do hard way? */
+ if (i%2) { /* is odd output line? */
+ if (j < 256) { /* before 1st odd input line */
+ vOffsetsUV[i] = 1; /* all from line 1 */
+ vWeightsUV[i] = 0; /* weight to give to 2nd line */
+ } else {
+ k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
+ vOffsetsUV[i] = k;
+ wY2 = j - (k << 8);
+ vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* is even output line */
+#ifdef VANILLA
+ k = (j >> 9) << 1; /* next lower even line */
+ vOffsetsUV[i] = k;
+ wY2 = j - (k << 8);
+ vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
+#else
+ k = (j / 512) << 1; /* next lower even line */
+ vOffsetsUV[i] = k;
+ wY2 = j - (k << 8);
+ vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
+#endif
+ }
+ } else { /* simple way, do as progressive */
+#ifdef VANILLA
+ k = j >> 8;
+#else
+ k = j / 256; /* j >> 8; does not work right if -256 < j < 0 */
+#endif
+ vOffsetsUV[i] = k;
+ wY2 = j - (k << 8);
+ vWeightsUV[i] = wY2; /* weight to give to 2nd line */
+ }
+ }
+}
+
+/*
+ * YUY2
+ *
+ * For each horizontal output pair of pixels there are 2 qword masks followed by 2 int
+ * offsets. The 2 masks are the weights to be used for the luma and chroma, respectively.
+ * Each mask contains LeftWeight1, RightWeight1, LeftWeight2, RightWeight2. So a pair of pixels
+ * will be processed on each pass through the horizontal resize loop.
+ *
+ * The weights are scaled 0-256 and the left and right weights will sum to 256 for each pixel.
+ */
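+
+/* Note on the chroma weights computed below: U/V are sampled at half the
+ * luma rate, so the luma source position (k + wY2/256) is halved; for odd k
+ * that lands half a chroma sample past the chroma pair at k/2, hence
+ * wUV2 = 128 + (wY2 >> 1) for odd k and wUV2 = wY2 >> 1 for even k.
+ */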
+static void init_tables_yuy2(int newwidth, int newheight, int oldwidth, int oldheight,
+ int Interlaced, double hWarp, double vWarp,
+ uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights )
+{
+ int i;
+ int j;
+ int k;
+ int wY1;
+ int wY2;
+ int wUV1;
+ int wUV2;
+ DBG("init_yuy2: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n",
+ oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);
+
+ /* First set up horizontal table */
+ for(i=0; i < newwidth; i+=2) {
+ /* first make even pixel control */
+ if (hWarp==1) /* if no warp factor */
+ j = i * 256 * (oldwidth-1) / (newwidth-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1));
+
+ k = j>>8;
+ wY2 = j - (k << 8); /* luma weight of right pixel */
+ wY1 = 256 - wY2; /* luma weight of left pixel */
+ wUV2 = (k%2) ? 128 + (wY2 >> 1) : wY2 >> 1;
+ wUV1 = 256 - wUV2;
+
+ if (k > oldwidth - 2) {
+ hControl[i*3+4] = oldwidth - 1; /* point to last byte */
+ hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */
+ hControl[i*3+2] = 0x00000100; /* use 100% of rightmost U */
+ } else {
+ hControl[i*3+4] = k; /* pixel offset */
+ hControl[i*3] = wY2 << 16 | wY1; /* luma weights */
+ hControl[i*3+2] = wUV2 << 16 | wUV1; /* chroma weights */
+ }
+
+ /* now make odd pixel control */
+ if (hWarp==1) /* if no warp factor */
+ j = (i+1) * 256 * (oldwidth-1) / (newwidth-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1));
+
+ k = j>>8;
+ wY2 = j - (k << 8); /* luma weight of right pixel */
+ wY1 = 256 - wY2; /* luma weight of left pixel */
+ wUV2 = (k%2) ? 128 + (wY2 >> 1) : wY2 >> 1;
+ wUV1 = 256 - wUV2;
+
+ if (k > oldwidth - 2) {
+ hControl[i*3+5] = oldwidth - 1; /* point to last byte */
+ hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */
+ hControl[i*3+3] = 0x00000100; /* use 100% of rightmost V */
+ } else {
+ hControl[i*3+5] = k; /* pixel offset */
+ hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */
+ /* hControl[i*3+3] = wUV2 << 16 | wUV1; // chroma weights */
+ /* horiz chroma weights should be same as for even pixel - trbarry 09/16/2002 */
+ hControl[i*3+3] = hControl[i*3+2]; /* chroma weights */
+ }
+ }
+
+ hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */
+ hControl[newwidth*3+5] = 2 * (oldwidth-1);
+
+  /* Next set up the vertical table. The offsets are measured in lines and
+   * will be multiplied by the source pitch later. */
+ for(i=0; i< newheight; ++i) {
+ if (vWarp==1) /* if no warp factor */
+ j = i * 256 * (oldheight-1) / (newheight-1);
+ else /* stretch and warp somehow */
+ j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1));
+
+ if (Interlaced) { /* do hard way? */
+ if (i%2) { /* is odd output line? */
+ if (j < 256) { /* before 1st odd input line */
+ vOffsets[i] = 1; /* all from line 1 */
+ vWeights[i] = 0; /* weight to give to 2nd line */
+ } else {
+ k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* is even output line */
+ k = (j >> 9) << 1; /* next lower even line */
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
+ }
+ } else { /* simple way, do as progressive */
+ k = j >> 8;
+ vOffsets[i] = k;
+ wY2 = j - (k << 8);
+ vWeights[i] = wY2; /* weight to give to 2nd line */
+ }
+ }
+}
+
+/* Register allocation */
+/* index/counter registers (REGA, REGC) are loaded from 32-bit vars/arrays! */
+#define REGEA "eax"
+#define REGEB "ebx"
+#if defined(__x86_64__)
+# define REGA "rax"
+# define REGB "rbx"
+# define REGC "ecx"
+# define REGD "rdx"
+# define REGDI "rdi"
+# define REGSI "rsi"
+#elif defined(__i386__)
+# define REGA "eax"
+# define REGB "ebx"
+# define REGC "ecx"
+# define REGD "edx"
+# define REGDI "edi"
+# define REGSI "esi"
+#endif
+
+/* variables accessed from assembler code */
+#define _FPround1 "%0"
+#define _vWeight1 "%1"
+#define _vWeight2 "%2"
+#define _YMask "%3"
+#define _src_row_size "%4"
+#define _EndOffset "%5"
+#define _pControl "%6"
+#define _row_size "%7"
+#define _vWorkYW "%8"
+#define _dstp "%9"
+#define _vWorkUVW "%10"
+#define _FPround2 "%11"
+#define _srcp1 "%12"
+#define _srcp2 "%13"
+#if !defined(__x86_64__)
+#define _oldbx "%14"
+#define _SSEMMXenabledW "%15"
+#define _SSE2enabledW "%16"
+#endif
+
+/* structure for mmx constants */
+typedef union {
+ uint64_t uq[1]; /* Unsigned Quadword */
+ uint32_t ud[2]; /* Unsigned Doubleword */
+} ATTR_ALIGN(16) mmx_t;
+
+/* structure for sse2 constants */
+typedef union {
+ uint64_t uq[2]; /* Unsigned Quadword */
+ uint32_t ud[4]; /* Unsigned Doubleword */
+} ATTR_ALIGN(16) sse2_t;
+
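+/* For reference, a scalar C sketch of the vertical pass that the asm loops
+ * below implement. This helper is purely illustrative (hypothetical, never
+ * called): w2 is the 0-256 fixed point weight of the second line, taken
+ * from vWeights[], and 256-w2 the weight of the first.
+ */
+static inline void __attribute__((unused))
+vertical_blend_ref(uint8_t *dst, const uint8_t *top, const uint8_t *bot,
+                   int n, uint32_t w2)
+{
+  const uint32_t w1 = 256 - w2;
+  int i;
+  for (i = 0; i < n; i++)  /* blend, round and scale back to 8 bits */
+    dst[i] = (uint8_t)((w1 * top[i] + w2 * bot[i] + 128) >> 8);
+}
+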
+
+static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
+ const int dst_pitch, const int src_pitch,
+ const int dst_width, const int dst_height,
+ const int src_width, const int src_height,
+ const int Interlaced, const uint32_t * const hControl,
+ const uint32_t * const vOffsets, const uint32_t * const vWeights,
+ uint32_t *vWorkY, uint32_t *vWorkUV,
+ int dst_start)
+{
+#if defined(__i386__) || defined(__x86_64__)
+ const sse2_t YMask = {uq:{UINT64_C(0x00ff00ff00ff00ff),UINT64_C(0x00ff00ff00ff00ff)}}; /* keeps only luma */
+ const sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */
+ const sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */
+ sse2_t vWeight1;
+ sse2_t vWeight2;
+
+ const uint32_t *pControl = &hControl[0];
+ const uint32_t *vWorkYW = vWorkY;
+ const uint32_t *vWorkUVW = vWorkUV;
+ const uint8_t *srcp = src;
+ const uint8_t *srcp1;
+ const uint8_t *srcp2;
+ uint8_t *dstp = dst + dst_pitch*dst_start;
+
+ const uint32_t src_row_size = src_width * 2;
+ const uint32_t row_size = dst_width * 2;
+ const uint32_t EndOffset = src_row_size / 2;
+
+#if !defined(__x86_64__)
+ const int accel = xine_mm_accel();
+ const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */
+ const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
+ long int oldbx;
+#endif
+ int y;
+
+ for (y = dst_start; y < dst_height; y++) {
+
+ if(vOffsets[y] >= src_height) {
+ /* slice completed */
+ /*DBG("do_warp_yuy2: max input height reached: need line %d, height %d\n -> Returning next output line: %d\n",
+ vOffsets[y], src_height, y);*/
+ return y;
+ }
+
+ vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] =
+ (256-vWeights[y]) << 16 | (256-vWeights[y]);
+ vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] =
+ vWeights[y] << 16 | vWeights[y];
+
+ srcp1 = srcp + vOffsets[y] * src_pitch;
+ if (Interlaced)
+ srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1;
+ else
+ srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1;
+
+ __asm__ __volatile__ (
+#if !defined(__x86_64__)
+ /* store ebx (PIC) */
+ "mov %%"REGB", "_oldbx" \n\t"
+#endif
+ "movl "_src_row_size", %%"REGC" \n\t"
+ "shrl $3, %%"REGC" \n\t" /* 8 bytes a time */
+ "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */
+ "mov "_srcp2", %%"REGD" \n\t" /* next " */
+ "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */
+ "mov "_vWorkUVW", %%"REGB" \n\t" /* luma work destination line */
+ "xor %%"REGA", %%"REGA" \n\t"
+#if !defined(__x86_64__)
+ /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
+ * This first loop is not the performance bottleneck anyway but it is trivial to tune
+ * using SSE2 if we have proper alignment.
+ */
+ "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported?*/
+ "jz vMaybeSSEMMX \n\t" /* n, can't do anyway*/
+#endif
+ "cmpl $2, %%"REGC" \n\t" /* we have at least 16 bytes, 2 qwords? */
+ "jl vMaybeSSEMMX \n\t" /* n, don't bother*/
+
+ "shrl $1, %%"REGC" \n\t" /* do 16 bytes at a time instead*/
+ "decl %%"REGC" \n" /* jigger loop ct */
+
+ ".align 16 \n\t"
+
+ "movdqa "_FPround1", %%xmm0 \n\t"
+ "movdqa "_vWeight1", %%xmm5 \n\t"
+ "movdqa "_vWeight2", %%xmm6 \n\t"
+ "movdqa "_YMask", %%xmm7 \n"
+
+ "vLoopSSE2_Fetch: \n\t"
+#ifdef PREFETCH
+ " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
+ " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
+#endif
+ "vLoopSSE2: \n\t"
+ " movdqu (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
+ " movdqu (%%"REGD", %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */
+
+ " movdqa %%xmm1, %%xmm3 \n\t" /* get chroma bytes */
+ " pand %%xmm7, %%xmm1 \n\t" /* keep only luma */
+ " psrlw $8, %%xmm3 \n\t" /* right just chroma */
+ " pmullw %%xmm5, %%xmm1 \n\t" /* mult by weighting factor */
+ " pmullw %%xmm5, %%xmm3 \n\t" /* mult by weighting factor */
+
+ " movdqa %%xmm2, %%xmm4 \n\t" /* get chroma bytes */
+ " pand %%xmm7, %%xmm2 \n\t" /* keep only luma */
+ " psrlw $8, %%xmm4 \n\t" /* right just chroma */
+ " pmullw %%xmm6, %%xmm2 \n\t" /* mult by weighting factor */
+ " pmullw %%xmm6, %%xmm4 \n\t" /* mult by weighting factor */
+
+ " paddw %%xmm2, %%xmm1 \n\t" /* combine lumas */
+ " paddusw %%xmm0, %%xmm1 \n\t" /* round */
+ " psrlw $8, %%xmm1 \n\t" /* right adjust luma */
+#ifdef STREAMING_STORE
+ " movntdq %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+#else
+ " movdqu %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+#endif
+ " paddw %%xmm4, %%xmm3 \n\t" /* combine chromas */
+ " paddusw %%xmm0, %%xmm3 \n\t" /* round */
+ " psrlw $8, %%xmm3 \n\t" /* right adjust chroma */
+ " packuswb %%xmm3, %%xmm3 \n\t" /* pack UV's into low dword */
+ " movdq2q %%xmm3, %%mm1 \n\t" /* save in our work area */
+#ifdef STREAMING_STORE
+ " movntq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
+#else
+ " movq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
+#endif
+ " lea 8(%%"REGA"), %%"REGA" \n\t"
+ " decl %%"REGC" \n\t"
+
+ " jg vLoopSSE2_Fetch \n\t" /* if not on last one loop, prefetch */
+ " jz vLoopSSE2 \n\t" /* or just loop, or not */
+
+ /* done with our SSE2 fortified loop but we may need to pick up the spare change */
+#ifdef STREAMING_STORE
+ " sfence \n\t"
+#endif
+ " movl "_src_row_size", %%"REGC" \n\t" /* get count again */
+ " andl $15, %%"REGC" \n\t" /* just need mod 16 */
+
+ " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
+ " movq "_vWeight1", %%mm5 \n\t"
+ " movq "_vWeight2", %%mm6 \n\t"
+ " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
+
+ " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
+ " jz MoreSpareChange \n" /* n, did them all */
+
+ /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
+ * This first loop is not the performance bottleneck anyway but it is trivial to tune
+ * using SSE if we have proper alignment.
+ */
+ "vMaybeSSEMMX: \n\t"
+
+ " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
+ " movq "_vWeight1", %%mm5 \n\t"
+ " movq "_vWeight2", %%mm6 \n\t"
+ " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
+#if !defined(__x86_64__)
+ " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
+ " jz vLoopMMX \n\t" /* n, can't do anyway */
+#endif
+ " decl %%"REGC" \n" /* jigger loop ctr */
+
+ ".align 16 \n"
+ "vLoopSSEMMX_Fetch: \n\t"
+#ifdef PREFETCH
+ " prefetcht0 8(%%"REGSI", %%"REGA", 2) \n\t"
+ " prefetcht0 8(%%"REGD", %%"REGA", 2) \n"
+#endif
+ "vLoopSSEMMX: \n\t"
+ " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
+ " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
+
+ " movq %%mm1, %%mm3 \n\t" /* copy top bytes */
+ " pand %%mm7, %%mm1 \n\t" /* keep only luma */
+ " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */
+ " psrlw $8, %%mm3 \n\t" /* right just chroma */
+ " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */
+ " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */
+
+ " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */
+ " pand %%mm7, %%mm2 \n\t" /* keep only luma */
+ " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */
+ " psrlw $8, %%mm4 \n\t" /* right just chroma */
+ " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */
+ " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */
+
+ " paddw %%mm2, %%mm1 \n\t" /* combine lumas */
+ " paddusw %%mm0, %%mm1 \n\t" /* round */
+ " psrlw $8, %%mm1 \n\t" /* right adjust luma */
+#ifdef STREAMING_STORE
+ " movntq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+#else
+ " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+#endif
+ " paddw %%mm4, %%mm3 \n\t" /* combine chromas */
+ " paddusw %%mm0, %%mm3 \n\t" /* round */
+ " psrlw $8, %%mm3 \n\t" /* right adjust chroma */
+ " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */
+ " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
+
+ " lea 4(%%"REGA"), %%"REGA" \n\t"
+ " decl %%"REGC" \n\t"
+ " jg vLoopSSEMMX_Fetch \n\t" /* if not on last one loop, prefetch */
+ " jz vLoopSSEMMX \n\t" /* or just loop, or not */
+#ifdef STREAMING_STORE
+ " sfence \n\t"
+#endif
+ " jmp MoreSpareChange \n" /* all done with vertical */
+
+ ".align 16 \n"
+ "vLoopMMX: \n\t"
+
+ " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
+ " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
+
+ " movq %%mm1, %%mm3 \n\t" /* copy top bytes */
+ " pand %%mm7, %%mm1 \n\t" /* keep only luma */
+ " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */
+ " psrlw $8, %%mm3 \n\t" /* right just chroma */
+ " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */
+ " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */
+
+ " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */
+ " pand %%mm7, %%mm2 \n\t" /* keep only luma */
+ " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */
+ " psrlw $8, %%mm4 \n\t" /* right just chroma */
+ " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */
+ " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */
+
+ " paddw %%mm2, %%mm1 \n\t" /* combine lumas */
+ " paddusw %%mm0, %%mm1 \n\t" /* round */
+ " psrlw $8, %%mm1 \n\t" /* right adjust luma */
+ " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+
+ " paddw %%mm4, %%mm3 \n\t" /* combine chromas */
+ " paddusw %%mm0, %%mm3 \n\t" /* round */
+ " psrlw $8, %%mm3 \n\t" /* right adjust chroma */
+ " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */
+ " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
+
+ " lea 4(%%"REGA"), %%"REGA" \n\t"
+ " loop vLoopMMX \n"
+
+     /* Add a little code here to check if we have 2 more pixels to do and, if so, make one
+      * more pass thru vLoopMMX. We were processing in multiples of 4 pixels and always have
+      * an even number so there will never be more than 2 left. trbarry 7/29/2002
+      */
+ "MoreSpareChange: \n\t"
+
+ " cmpl "_EndOffset", %%"REGEA" \n\t" /* did we get them all */
+ " jnl DoHorizontal \n\t" /* yes, else have 2 left */
+ " movl $1, %%"REGC" \n\t" /* jigger loop ct */
+ " sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
+ " jmp vLoopMMX \n"
+
+ /* We've taken care of the vertical scaling, now do horizontal */
+ "DoHorizontal: \n\t"
+
+ " movq "_YMask", %%mm7 \n\t" /* useful 0U0U.. mask constant */
+ " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
+ " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
+ " movl "_row_size", %%"REGC" \n\t"
+ " shrl $2, %%"REGC" \n\t" /* bytes a time, 2 pixels */
+ " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */
+ " mov "_dstp", %%"REGDI" \n\t" /* the destination line */
+ " mov "_vWorkUVW", %%"REGB" \n" /* chroma data, as UVUV UVUV... */
+
+ ".align 16 \n"
+ "hLoopMMX: \n\t"
+
+ /* x86_64: must use movl (accessing table of uint32's) */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movd (%%"REGD", %%"REGA", 2), %%mm0 \n\t" /* copy luma pair */
+ " shr $1, %%"REGA" \n\t" /* div offset by 2 */
+ " movd (%%"REGB", %%"REGA", 2), %%mm1 \n\t" /* copy UV pair VUVU */
+ " psllw $8, %%mm1 \n\t" /* shift out V, keep 0000U0U0 */
+
+      /* we need to use both even and odd chroma from same location - trb 9/2002 */
+ " punpckldq (%%"REGB", %%"REGA", 2), %%mm1 \r\n" /* copy UV pair VUVU */
+ " psrlw $8, %%mm1 \r\n" /* shift out U0, keep 0V0V 0U0U */
+ " movl 20(%%"REGSI"), %%"REGEA" \r\n" /* get data offset in pixels, 2nd pixel pair */
+ " punpckldq (%%"REGD", %%"REGA", 2), %%mm0 \r\n" /* copy luma pair */
+
+ " pmaddwd (%%"REGSI"), %%mm0 \r\n" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \r\n" /* round */
+ " psrlw $8, %%mm0 \r\n" /* right just 2 luma pixel value 000Y,000Y */
+
+ " pmaddwd 8(%%"REGSI"), %%mm1 \r\n" /* mult and sum chromas by ctl weights */
+ " paddusw %%mm6, %%mm1 \r\n" /* round */
+ " pslld $8, %%mm1 \r\n" /* shift into low bytes of different words */
+ " pand %%mm7, %%mm1 \r\n" /* keep only 2 chroma values 0V00,0U00 */
+ " por %%mm1, %%mm0 \r\n" /* combine luma and chroma, 0V0Y,0U0Y */
+ " packuswb %%mm0, %%mm0 \r\n" /* pack all into low dword, xxxxVYUY */
+ " movd %%mm0, (%%"REGDI") \n\t" /* done with 2 pixels */
+
+ " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytest */
+ " lea 4(%%"REGDI"), %%"REGDI" \n\t" /* bump to next output pixel addr */
+
+ " loop hLoopMMX \n\t" /* loop for more */
+
+ "emms \n\t"
+ /* done with one line */
+
+#if !defined(__x86_64__)
+ "mov "_oldbx", %%"REGB" \n\t"
+#endif
+ ::
+ "m" /*0*/(FPround1),
+ "m" /*1*/(vWeight1),
+ "m" /*2*/(vWeight2),
+ "m" /*3*/(YMask),
+ "m" /*4*/(src_row_size),
+ "m" /*5*/(EndOffset),
+ "m" /*6*/(pControl),
+ "m" /*7*/(row_size),
+ "m" /*8*/(vWorkYW),
+ "m" /*9*/(dstp),
+ "m" /*10*/(vWorkUVW),
+ "m" /*11*/(FPround2),
+ "m" /*12*/(srcp1),
+ "m" /*13*/(srcp2)
+#if !defined(__x86_64__)
+ ,
+ "m" /*14*/(oldbx),
+ "m" /*15*/(SSEMMXenabledW),
+ "m" /*16*/(SSE2enabledW)
+ : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI
+#else
+ : REGA, REGB, REGC, REGD, REGSI, REGDI
+#endif
+ );
+
+ dstp += dst_pitch;
+ }
+#endif
+ return 0;
+}
+
+static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
+ const int dst_pitch, const int src_pitch,
+ const int dst_width, const int dst_height,
+ const int src_width, const int src_height,
+ const int Interlaced, const uint32_t * const hControl,
+ const uint32_t * vOffsets, const uint32_t * vWeights,
+ uint32_t *vWorkY, int dst_start)
+{
+#if defined(__i386__) || defined(__x86_64__)
+ const sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */
+ const sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */
+ sse2_t vWeight1;
+ sse2_t vWeight2;
+
+ const uint32_t *pControl = &hControl[0];
+ const uint32_t *vWorkYW = vWorkY;
+ const uint8_t *srcp = src;
+ const uint8_t *srcp1;
+ const uint8_t *srcp2;
+ uint8_t *dstp = dst + dst_pitch*dst_start;
+
+ const uint32_t src_row_size = src_width;
+ const uint32_t row_size = dst_width;
+
+#if !defined(__x86_64__)
+ const int accel = xine_mm_accel();
+ const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */
+ const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
+ long int oldbx;
+#endif
+ int y;
+
+  /* Operation in sliced mode:
+   * - continue until the required next source line is out of the slice
+   * - return the next output line
+   * - at the next call, continue from the next source line
+   */
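+
+  /* A hypothetical slice driver would feed consecutive source slices and
+   * carry the return value forward, e.g.:
+   *
+   *   int next = 0;
+   *   while ((next = do_warp_yv12(dst, slice, ..., next)) > 0)
+   *     advance to the next source slice
+   *
+   * (this plugin itself always passes dst_start = 0 from draw_internal(),
+   * processing the whole frame in one call)
+   */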
+
+ for (y = dst_start; y < dst_height; y++) {
+ if(vOffsets[y] >= src_height) {
+ /* slice completed */
+ /*DBG("do_warp_yv12: max input height reached: need line %d, height %d\n -> Returning next output line: %d , start was %d\n",
+ (int)vOffsets[y], (int)src_height, (int)y, (int)dst_start);*/
+ return y;
+ }
+
+ vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] =
+ (256-vWeights[y]) << 16 | (256-vWeights[y]);
+ vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] =
+ vWeights[y] << 16 | vWeights[y];
+
+ srcp1 = srcp + vOffsets[y] * src_pitch;
+
+ if (Interlaced)
+ srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1;
+ else
+ srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1;
+
+ __asm__ __volatile__(
+ "movl "_src_row_size", %%"REGC" \n\t"
+ "shr $3, %%"REGC" \n\t" /* 8 bytes a time */
+ "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */
+ "mov "_srcp2", %%"REGD" \n\t" /* next " */
+ "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */
+ "xor %%"REGA", %%"REGA" \n\t"
+#if !defined(__x86_64__)
+ /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
+ * This first loop is not the performance bottleneck anyway but it is trivial to tune
+ * using SSE2 if we have proper alignment.
+ */
+ "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported? */
+ "jz vMaybeSSEMMX_12 \n\t" /* n, can't do anyway */
+#endif
+ "cmpl $2, %%"REGC" \n\t" /* we have at least 16 byts, 2 qwords? */
+ "jl vMaybeSSEMMX_12 \n\t" /* n, don't bother */
+
+ "mov %%"REGSI", %%"REGB" \n\t"
+ "or %%"REGD", %%"REGB" \n\t"
+ "test $15, %%"REGB" \n\t" /* both src rows 16 byte aligned? */
+ "jnz vMaybeSSEMMX_12 \n\t" /* n, don't use sse2 */
+
+ "shr $1, %%"REGC" \n\t" /* do 16 bytes at a time instead */
+ "dec %%"REGC" \n\t" /* jigger loop ct */
+
+ "movdqa "_FPround1", %%xmm0 \n\t"
+ "movdqa "_vWeight1", %%xmm5 \n\t"
+ "movdqa "_vWeight2", %%xmm6 \n\t"
+ "pxor %%xmm7, %%xmm7 \n"
+
+ ".align 16 \n"
+ "vLoopSSE2_Fetch_12: \n\t"
+#ifdef PREFETCH
+ " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
+ " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
+#endif
+ "vLoopSSE2_12: \n\t"
+      /* we've already checked that both pointers are dqword aligned */
+ " movdqa (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */
+ " movdqa (%%"REGD", %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */
+ " movdqa %%xmm1, %%xmm2 \n\t"
+ " movdqa %%xmm3, %%xmm4 \n\t"
+
+ " punpcklbw %%xmm7, %%xmm1 \n\t" /* make words */
+ " punpckhbw %%xmm7, %%xmm2 \n\t" /* " */
+ " punpcklbw %%xmm7, %%xmm3 \n\t" /* " */
+ " punpckhbw %%xmm7, %%xmm4 \n\t" /* " */
+
+ " pmullw %%xmm5, %%xmm1 \n\t" /* mult by top weighting factor */
+ " pmullw %%xmm5, %%xmm2 \n\t" /* " */
+ " pmullw %%xmm6, %%xmm3 \n\t" /* mult by bot weighting factor */
+ " pmullw %%xmm6, %%xmm4 \n\t" /* " */
+
+ " paddw %%xmm3, %%xmm1 \n\t" /* combine lumas low */
+ " paddw %%xmm4, %%xmm2 \n\t" /* combine lumas high */
+
+ " paddusw %%xmm0, %%xmm1 \n\t" /* round */
+ " paddusw %%xmm0, %%xmm2 \n\t" /* round */
+
+ " psrlw $8, %%xmm1 \n\t" /* right adjust luma */
+ " psrlw $8, %%xmm2 \n\t" /* right adjust luma */
+
+ " packuswb %%xmm2, %%xmm1 \n\t" /* pack words to our 16 byte answer */
+#ifdef STREAMING_STORE
+ " movntdq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+#else
+ " movdqu %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+#endif
+ " lea 16(%%"REGA"), %%"REGA" \n\t"
+ " decl %%"REGC" \n\t"
+
+ " jg vLoopSSE2_Fetch_12 \n\t" /* if not on last one loop, prefetch */
+ " jz vLoopSSE2_12 \n\t" /* or just loop, or not */
+
+ /* done with our SSE2 fortified loop but we may need to pick up the spare change */
+#ifdef STREAMING_STORE
+ " sfence \n\t"
+#endif
+ " movl "_src_row_size", %%"REGC" \n\t" /* get count again */
+ " andl $15, %%"REGC" \n\t" /* just need mod 16 */
+ " movq "_vWeight1", %%mm5 \n\t"
+ " movq "_vWeight2", %%mm6 \n\t"
+ " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
+
+ " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
+ " jz MoreSpareChange_12 \n" /* n, did them all */
+
+ /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
+ * This first loop is not the performance bottleneck anyway but it is trivial to tune
+ * using SSE if we have proper alignment.
+ */
+ "vMaybeSSEMMX_12: \n\t"
+
+ " movq "_vWeight1", %%mm5 \n\t"
+ " movq "_vWeight2", %%mm6 \n\t"
+ " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
+ " pxor %%mm7, %%mm7 \n\t"
+#if !defined(__x86_64__)
+ " testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXTsupported? */
+ " jz vLoopMMX_12 \n\t" /* n, can't do anyway */
+#endif
+ " decl %%"REGC" \n" /* jigger loop ctr */
+
+ ".align 16 \n"
+ "vLoopSSEMMX_Fetch_12: \n\t"
+#ifdef PREFETCH
+ " prefetcht0 8(%%"REGSI", %%"REGA") \n\t"
+ " prefetcht0 8(%%"REGD", %%"REGA") \n"
+#endif
+ "vLoopSSEMMX_12: \n\t"
+
+ " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
+ " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
+
+ " movq %%mm1, %%mm2 \n\t"
+ " movq %%mm3, %%mm4 \n\t"
+
+ " punpcklbw %%mm7, %%mm1 \n\t" /* make words */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* " */
+ " punpcklbw %%mm7, %%mm3 \n\t" /* " */
+ " punpckhbw %%mm7, %%mm4 \n\t" /* " */
+
+ " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */
+ " pmullw %%mm5, %%mm2 \n\t" /* " */
+ " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */
+ " pmullw %%mm6, %%mm4 \n\t" /* " */
+
+ " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */
+ " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */
+
+ " paddusw %%mm0, %%mm1 \n\t" /* round */
+ " paddusw %%mm0, %%mm2 \n\t" /* round */
+
+ " psrlw $8, %%mm1 \n\t" /* right adjust luma */
+ " psrlw $8, %%mm2 \n\t" /* right adjust luma */
+
+ " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 16 byte answer */
+#ifdef STREAMING_STORE
+ " movntq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+#else
+ " movq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+#endif
+ " lea 8(%%"REGA"), %%"REGA" \n\t"
+ " decl %%"REGC" \n\t"
+
+ " jg vLoopSSEMMX_Fetch_12 \n\t" /* if not on last one loop, prefetch */
+ " jz vLoopSSEMMX_12 \n\t" /* or just loop, or not */
+#ifdef STREAMING_STORE
+ " sfence \n\t"
+#endif
+ " jmp MoreSpareChange_12 \n" /* all done with vertical */
+
+ ".align 16 \n"
+ "vLoopMMX_12: \n\t"
+
+ " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
+ " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
+
+ " movq %%mm1, %%mm2 \n\t"
+ " movq %%mm3, %%mm4 \n\t"
+
+ " punpcklbw %%mm7, %%mm1 \n\t" /* make words */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* " */
+ " punpcklbw %%mm7, %%mm3 \n\t" /* " */
+ " punpckhbw %%mm7, %%mm4 \n\t" /* " */
+
+ " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */
+ " pmullw %%mm5, %%mm2 \n\t" /* " */
+ " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */
+ " pmullw %%mm6, %%mm4 \n\t" /* " */
+
+ " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */
+ " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */
+
+ " paddusw %%mm0, %%mm1 \n\t" /* round */
+ " paddusw %%mm0, %%mm2 \n\t" /* round */
+
+ " psrlw $8, %%mm1 \n\t" /* right adjust luma */
+ " psrlw $8, %%mm2 \n\t" /* right adjust luma */
+
+ " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 16 byte answer */
+ " movq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+
+ " lea 8(%%"REGA"), %%"REGA" \n\t"
+ " loop vLoopMMX_12 \n"
+
+     /* Add a little code here to check if we have more pixels to do and, if so, make one
+      * more pass thru vLoopMMX. We were processing in multiples of 8 pixels and always have
+      * an even number so there will never be more than 7 left.
+      */
+ "MoreSpareChange_12: \n\t"
+
+ " cmpl "_src_row_size", %%"REGEA" \n\t" /* did we get them all */
+ " jnl DoHorizontal_12 \n\t" /* yes, else have 2 left */
+ " movl $1, %%"REGC" \n\t" /* jigger loop ct */
+ " movl "_src_row_size", %%"REGEA" \n\t"
+ " sub $8, %%"REGA" \n\t" /* back up to last 8 pixels */
+ " jmp vLoopMMX_12 \n"
+
+ /* We've taken care of the vertical scaling, now do horizontal */
+ "DoHorizontal_12: \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
+ " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
+ " movl "_row_size", %%"REGC" \n\t"
+ " shrl $2, %%"REGC" \n\t" /* 4 bytes a time, 4 pixels */
+ " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */
+ " mov "_dstp", %%"REGDI" \n\t" /* the destination line */
+#if !defined(__x86_64__)
+ " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
+ " jz hLoopMMX_12 \n\t" /* n, can't do anyway */
+#endif
+ /* With SSE support we will make 8 pixels (from 8 pairs) at a time */
+ " shrl $1, %%"REGC" \n\t" /* 8 bytes a time instead of 4 */
+ " jz LessThan8 \n"
+
+ ".align 16 \n"
+ "hLoopMMXSSE_12: \n\t"
+
+
+ /* handle first 2 pixels */
+ /* phi: must use movl here (x86_64, reading from table of uint_32's) */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
+
+ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */
+ " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */
+ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \n\t" /* round */
+ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* handle 3rd and 4th pixel pairs */
+ " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " movl 16+48(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 5st pixel pair */
+ " movl 20+48(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 6nd pixel pair */
+ " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm1 \n\t" /* round */
+ " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* handle 5th and 6th pixel pairs */
+ " movd (%%"REGD", %%"REGA"), %%mm2 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm2 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm2 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " movl 16+72(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 7st pixel pair */
+ " movl 20+72(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 8nd pixel pair */
+ " pmaddwd 48(%%"REGSI"), %%mm2 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm2 \n\t" /* round */
+ " psrlw $8, %%mm2 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* handle 7th and 8th pixel pairs */
+ " movd (%%"REGD", %%"REGA"), %%mm3 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm3 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm3 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " pmaddwd 72(%%"REGSI"), %%mm3 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm3 \n\t" /* round */
+ " psrlw $8, %%mm3 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* combine, store, and loop */
+ " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */
+ " packuswb %%mm3, %%mm2 \n\t" /* pack into qword, 0Y0Y0Y0Y */
+ " packuswb %%mm2, %%mm0 \n\t" /* and again into YYYYYYYY */
+#ifdef STREAMING_STORE
+      " movntq %%mm0, (%%"REGDI")         \n\t" /* done with 8 pixels */
+#else
+      " movq %%mm0, (%%"REGDI")           \n\t" /* done with 8 pixels */
+#endif
+
+ " lea 96(%%"REGSI"), %%"REGSI" \n\t"
+ " lea 8(%%"REGDI"), %%"REGDI" \n\t"
+ " decl %%"REGC" \n\t"
+ " jg hLoopMMXSSE_12 \n\t" /* loop for more */
+#ifdef STREAMING_STORE
+ " sfence \n"
+#endif
+ "LessThan8: \n\t"
+ " movl "_row_size", %%"REGC" \n\t"
+ " andl $7, %%"REGC" \n\t" /* we have done all but maybe this */
+ " shrl $2, %%"REGC" \n\t" /* now do only 4 bytes at a time */
+ " jz LessThan4 \n"
+
+ ".align 16 \n"
+ "hLoopMMX_12: \n\t"
+
+ /* handle first 2 pixels */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
+ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */
+ " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */
+ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \n\t" /* round */
+ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* handle 3rd and 4th pixel pairs */
+ " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */
+ " punpckldq (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+ " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm1 \n\t" /* round */
+ " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+
+ /* combine, store, and loop */
+ " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */
+ " packuswb %%mm7, %%mm0 \n\t" /* and again into 0000YYYY */
+ " movd %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */
+ " lea 48(%%"REGSI"), %%"REGSI" \n\t"
+ " lea 4(%%"REGDI"), %%"REGDI" \n\t"
+
+ " loop hLoopMMX_12 \n" /* loop for more */
+
+ /* test to see if we have a mod 4 size row, if not then more spare change */
+ "LessThan4: \n\t"
+ " movl "_row_size", %%"REGC" \n\t"
+ " andl $3, %%"REGC" \n\t" /* remainder side mod 4 */
+ " cmpl $2, %%"REGC" \n\t"
+ " jl LastOne \n\t" /* none, none */
+
+ /* handle 2 more pixels */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
+ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+
+ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \n\t" /* round */
+ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+ " packuswb %%mm7, %%mm0 \n\t" /* pack into qword, 00000Y0Y */
+ " packuswb %%mm7, %%mm0 \n\t" /* and again into 000000YY */
+ " movd %%mm0, (%%"REGDI") \n\t" /* store, we are guarrenteed room in buffer (8 byte mult) */
+ " subl $2, %%"REGC" \n\t"
+
+ " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes */
+ " lea 2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */
+
+ /* maybe one last pixel */
+ "LastOne: \n\t"
+ " cmpl $0, %%"REGC" \r\n" /* still more ? */
+ " jz AllDone \r\n" /* n, done */
+ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
+ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
+
+ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
+ " paddusw %%mm6, %%mm0 \n\t" /* round */
+ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
+ " movd %%mm0, %%"REGEA" \n\t"
+ " movb %%al, (%%"REGDI") \n" /* store last one */
+
+ "AllDone: \n\t"
+ " emms \n\t"
+#if !defined(__x86_64__)
+ "mov "_oldbx", %%"REGB" \n\t"
+#endif
+ ::
+ "m" /*0*/(FPround1),
+ "m" /*1*/(vWeight1),
+ "m" /*2*/(vWeight2),
+ "m" /*3*/(y/*YMask[0]*/),
+ "m" /*4*/(src_row_size),
+ "m" /*5*/(y/*EndOffset*/),
+ "m" /*6*/(pControl),
+ "m" /*7*/(row_size),
+ "m" /*8*/(vWorkYW),
+ "m" /*9*/(dstp),
+ "m" /*10*/(y/*vWorkUVW*/),
+ "m" /*11*/(FPround2),
+ "m" /*12*/(srcp1),
+ "m" /*13*/(srcp2)
+#if !defined(__x86_64__)
+ ,
+ "m" /*14*/(oldbx),
+ "m" /*15*/(SSEMMXenabledW),
+ "m" /*16*/(SSE2enabledW)
+ : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI
+#else
+ : REGA, REGB, REGC, REGD, REGSI, REGDI
+#endif
+ );
+
+ dstp += dst_pitch;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * tools
+ */
+
+#ifndef ALIGN
+# define ALIGN(b,p) ((void*)((((unsigned long)(p)) + (b)-1) & (~((b)-1))))
+#endif
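+/* e.g. ALIGN(128, p) rounds pointer p up to the next 128 byte boundary
+ * (unchanged if already aligned) */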
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+# define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+#ifndef FABS
+# define FABS(x) ((x) < 0.0 ? -(x) : (x))
+#endif
+
+/*
+ * xine plugin
+ */
+
+#define PLUGIN_ID "warp"
+#define PLUGIN_DESCR "(non-)linear software scaling post plugin"
+#define PLUGIN_T warp_plugin_t
+/*#define POST_THREADS*/
+/*#define POST_SLICES*/
+#include "xine/post_util.h"
+
+
+/* plugin class initialization function */
+void *warp_init_plugin(xine_t *xine, void *);
+
+/* plugin class functions */
+static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs,
+ xine_audio_port_t **audio_target,
+ xine_video_port_t **video_target);
+
+/* plugin instance functions */
+static void warp_dispose(post_plugin_t *this_gen);
+
+/* vo_frame functions */
+static vo_frame_t *got_frame(vo_frame_t *frame);
+static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame);
+
+
+/* parameter functions */
+static xine_post_api_descr_t *warp_get_param_descr(void);
+static int warp_set_parameters(xine_post_t *this_gen, void *param_gen);
+static int warp_get_parameters(xine_post_t *this_gen, void *param_gen);
+static char *warp_get_help(void);
+
+
+typedef struct warp_parameters_s {
+ int output_width;
+ int output_height;
+ double output_aspect;
+ int no_downscaling;
+} warp_parameters_t;
+
+START_PARAM_DESCR(warp_parameters_t)
+PARAM_ITEM(POST_PARAM_TYPE_INT, output_width, NULL, 640, 1920, 0,
+ "output video width")
+PARAM_ITEM(POST_PARAM_TYPE_INT, output_height, NULL, 480, 1080, 0,
+ "output video height")
+PARAM_ITEM(POST_PARAM_TYPE_DOUBLE, output_aspect, NULL, 1, 3, 0,
+ "output video aspect ratio")
+PARAM_ITEM(POST_PARAM_TYPE_BOOL, no_downscaling,NULL, 0, 1, 0,
+ "disable downscaling")
+END_PARAM_DESCR(warp_param_descr)
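+
+/* How a frontend could adjust these at runtime - a sketch using the generic
+ * xine post parameter API (not part of this file):
+ *
+ *   xine_post_in_t    *in  = xine_post_input(post, "parameters");
+ *   xine_post_api_t   *api = (xine_post_api_t *)in->data;
+ *   warp_parameters_t  p;
+ *   api->get_parameters(post, &p);
+ *   p.output_width  = 1280;
+ *   p.output_height = 720;
+ *   api->set_parameters(post, &p);
+ */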
+
+
+typedef struct {
+ post_plugin_t post;
+
+ xine_post_in_t parameter_input;
+
+ /* User config (changes to actual config are delayed) */
+ warp_parameters_t config;
+
+ /* Current config */
+ int enable;
+ int output_width;
+ int output_height;
+ double output_aspect;
+ double factor_x;
+ double factor_y;
+
+ /* Last seen input frame */
+ int input_width;
+ int input_height;
+ int input_format;
+ int input_interlaced;
+ double input_aspect;
+
+ /* working buffers */
+ uint32_t *vWorkY;
+ uint32_t *vWorkUV;
+
+ /* scaling tables */
+ uint32_t *hControl;
+ uint32_t *hControlUV;
+ uint32_t *vOffsets;
+ uint32_t *vOffsetsUV;
+ uint32_t *vWeights;
+ uint32_t *vWeightsUV;
+
+ /* memory for work areas and scaling tables */
+ void *pMem;
+
+} warp_plugin_t;
+
+/*
+ *
+ */
+
+static void init_tables(warp_plugin_t *this)
+{
+#define BP(x) ((uint8_t*)(x))
+ /* allocate memory for scaling tables and workspace */
+ free(this->pMem);
+ this->pMem = xine_xmalloc(this->input_width*3 + this->output_width*sizeof(uint32_t)*3*2 +
+ this->output_height*sizeof(uint32_t)*4 + 2*9*128);
+
+ /* - aligned for P4 cache line */
+ this->vWorkY = (uint32_t*)ALIGN(128, this->pMem);
+ this->vWorkUV = (uint32_t*)ALIGN(128, BP(this->vWorkY) + this->input_width*2 + 128);
+ this->hControl = (uint32_t*)ALIGN(128, BP(this->vWorkUV) + this->input_width + 128);
+ this->vOffsets = (uint32_t*)ALIGN(128, BP(this->hControl) + this->output_width * sizeof(uint32_t) * 3 + 128);
+ this->vWeights = (uint32_t*)ALIGN(128, BP(this->vOffsets) + this->output_height * sizeof(uint32_t) + 128);
+
+ if (this->input_format == XINE_IMGFMT_YV12) {
+ this->vOffsetsUV = (uint32_t*)ALIGN(128, BP(this->vWeights) + this->output_height * sizeof(uint32_t) + 128);
+ this->vWeightsUV = (uint32_t*)ALIGN(128, BP(this->vOffsetsUV) + this->output_height * sizeof(uint32_t) + 128);
+ this->hControlUV = (uint32_t*)ALIGN(128, BP(this->vWeightsUV) + this->output_height * sizeof(uint32_t) + 128);
+
+ init_tables_yv12(this->output_width, this->output_height,
+ this->input_width, this->input_height,
+ this->input_interlaced, this->factor_x, this->factor_y,
+ this->hControl, this->vOffsets, this->vWeights,
+ this->hControlUV, this->vOffsetsUV, this->vWeightsUV );
+
+ } else if (this->input_format == XINE_IMGFMT_YUY2) {
+
+ init_tables_yuy2(this->output_width, this->output_height,
+ this->input_width, this->input_height,
+ this->input_interlaced, this->factor_x, this->factor_y,
+ this->hControl, this->vOffsets, this->vWeights );
+ }
+}
+
+static void calculate_factors(warp_plugin_t *this)
+{
+ /* try to guess amount to stretch/shrink */
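+  /* Worked example (illustrative numbers): a 16:9 input (ratio 1.778) with a
+   * 4:3 target (1.333) gives adiff = 0.444 and takes the "16:9 ... 12:9 ->
+   * 4:3" branch below:
+   *   factor_x = 1.0 - 0.05 * 0.444 * 9.0/4.0 = 0.95
+   *   factor_y = 1.0 + 0.15 * 0.444 * 9.0/4.0 = 1.15
+   */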
+ double adiff = this->input_aspect - this->output_aspect;
+ this->factor_x = 1.0;
+ this->factor_y = 1.0;
+
+ if (adiff > 0.1) {
+
+ if (adiff > 0.1 + ((16.0-12.0)/9.0)) {
+ /* >16:9 -> >4:3 */
+ DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff);
+ this->factor_x = 0.95;
+ this->factor_y = 1.15;
+ this->output_aspect += (adiff - 4.0/9.0);
+ DBG(" changing target ratio to %3.1lf\n", this->output_aspect);
+ } else {
+ /* 16:9 ... 12:9 -> 4:3 */
+ DBG("aspect ratio diff %1.3lf > 0 : 16.9...12:9 -> 4:3\n", adiff);
+ this->factor_x = 1.0 - 0.05 * adiff * 9.0/4.0;
+ this->factor_y = 1.0 + 0.15 * adiff * 9.0/4.0;
+ }
+
+ } else if (adiff < -0.1) {
+
+ if(adiff < -0.1-((16.0-12.0)/9.0)) {
+ /* <4:3 -> <16:9 */
+ DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff);
+ this->factor_x = 1.05;
+ this->factor_y = 0.85;
+ this->output_aspect += (adiff + 4.0/9.0);
+ DBG(" changing target ratio to %3.1lf\n", this->output_aspect);
+ } else {
+ /* 4:3...16:9 -> 16:9 */
+ DBG("aspect ratio diff %1.3lf < 0 : 4:3...16:9 -> 16:9\n", adiff);
+ this->factor_x = 1.0 + 0.05 * adiff * 9.0/4.0;
+ this->factor_y = 1.0 - 0.15 * adiff * 9.0/4.0;
+ }
+
+ } else {
+ DBG("aspect ratio matches, no warp\n");
+ this->factor_x = 1.0;
+ this->factor_y = 1.0;
+ }
+
+ DBG("factor_x = %1.3lf factor_y = %1.3lf output ratio = %1.3lf\n",
+ this->factor_x, this->factor_y, this->output_aspect);
+}
+
+/*
+ *
+ */
+
+void *warp_init_plugin(xine_t *xine, void *data)
+{
+#if !defined(__x86_64__)
+ /* Need at least MMX */
+ if (!(xine_mm_accel() & MM_ACCEL_X86_MMX)) {
+ fprintf(stderr, "warp_init_plugin: ERROR: at least MMX required\n");
+ return NULL;
+ }
+#endif
+
+ return init_plugin(xine, data);
+}
+
+static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs,
+ xine_audio_port_t **audio_target,
+ xine_video_port_t **video_target)
+{
+ warp_plugin_t *this = (warp_plugin_t *) xine_xmalloc(sizeof(warp_plugin_t));
+ post_plugin_t *this_gen = (post_plugin_t *) this;
+ post_in_t *input;
+ post_out_t *output;
+ xine_post_in_t *input_param;
+ post_video_port_t *port;
+
+ static xine_post_api_t post_api =
+ { warp_set_parameters, warp_get_parameters, warp_get_param_descr, warp_get_help };
+
+ if (!this || !video_target || !video_target[0]) {
+ free(this);
+ return NULL;
+ }
+
+ _x_post_init(this_gen, 0, 1);
+
+ port = _x_post_intercept_video_port(this_gen, video_target[0], &input, &output);
+ port->intercept_frame = intercept_frame_yuy;
+ port->new_frame->draw = post_draw;
+ input->xine_in.name = "video";
+ output->xine_out.name = "video (scaled)";
+ this_gen->xine_post.video_input[0] = &port->new_port;
+
+ this_gen->dispose = warp_dispose;
+
+ input_param = &this->parameter_input;
+ input_param->name = "parameters";
+ input_param->type = XINE_POST_DATA_PARAMETERS;
+ input_param->data = &post_api;
+ xine_list_push_back(this_gen->input, input_param);
+
+ this->config.output_aspect = 0.0; /* -> do not change aspect ratio */
+ this->config.output_width = 0; /* -> do not change width */
+ this->config.output_height = 0; /* -> do not change height */
+ this->config.no_downscaling = 0;
+
+ this->input_width = 0; /* not known yet, triggers initialization later */
+ this->input_height = 0;
+
+ return this_gen;
+}
+
+static void warp_dispose(post_plugin_t *this_gen)
+{
+ if (_x_post_dispose(this_gen)) {
+ warp_plugin_t *this = (warp_plugin_t *) this_gen;
+
+ DBG("dispose\n");
+
+ free(this->pMem);
+ free(this);
+ }
+}
+
+static vo_frame_t *got_frame(vo_frame_t *frame)
+{
+ post_video_port_t *port = (post_video_port_t *)frame->port;
+ warp_plugin_t *this = (warp_plugin_t *)port->post;
+ double adiff = this->input_aspect - frame->ratio;
+
+ if (this->input_width != frame->width || this->input_height != frame->height ||
+ this->input_format != frame->format || FABS(adiff)>0.1 ||
+ this->input_interlaced != !!(frame->flags & VO_INTERLACED_FLAG)) {
+
+ DBG("detected frame format change: %dx%d -> %dx%d, interlaced %d->%d, aspect %1.3lf->%1.3lf, %s->%s\n",
+ this->input_width, this->input_height, frame->width, frame->height,
+ this->input_interlaced, !!(frame->flags & VO_INTERLACED_FLAG),
+ this->input_aspect, frame->ratio,
+ this->input_format==XINE_IMGFMT_YV12 ? "yv12":"yuy2",
+ frame->format==XINE_IMGFMT_YV12 ? "yv12":"yuy2" );
+
+ /* free tables and buffers */
+ free(this->pMem);
+ this->pMem = NULL;
+
+ /* remember frame properties to detect changes in video format */
+ this->input_width = frame->width;
+ this->input_height = frame->height;
+ this->input_format = frame->format;
+ this->input_aspect = frame->ratio;
+ this->input_interlaced = !!(frame->flags & VO_INTERLACED_FLAG);
+
+ /* re-configure target size and aspect ratio */
+ this->output_aspect = this->config.output_aspect ?: frame->ratio;
+ if (!this->config.no_downscaling) {
+ this->output_width = this->config.output_width ?: frame->width;
+ this->output_height = this->config.output_height ?: frame->height;
+ } else {
+ this->output_width = MAX(this->config.output_width, frame->width);
+ this->output_height = MAX(this->config.output_height, frame->height);
+ }
+
+ /* calculate warp function factors */
+ calculate_factors(this);
+
+ if(this->output_width == frame->width &&
+ this->output_height == frame->height &&
+ adiff < 0.1 &&
+ adiff > -0.1 ) {
+ this->enable = 0;
+ DBG("--> nothing to do, disabling processing for now");
+ return NULL;
+ }
+
+ this->enable = 1;
+
+ init_tables(this);
+ }
+
+ if (!this->enable)
+ return NULL;
+
+ return port->original_port->get_frame(port->original_port,
+ this->output_width, this->output_height,
+ this->output_aspect, frame->format,
+ frame->flags | VO_BOTH_FIELDS);
+}
+
+static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame)
+{
+ post_video_port_t *port = (post_video_port_t *)frame->port;
+ warp_plugin_t *this = (warp_plugin_t *)port->post;
+ int proc_height = frame->height;
+
+ if (frame->format == XINE_IMGFMT_YV12) {
+
+ do_warp_yv12(new_frame->base[0], frame->base[0],
+ new_frame->pitches[0], frame->pitches[0],
+ this->output_width, this->output_height,
+ frame->width, proc_height,
+ this->input_interlaced,
+ this->hControl, this->vOffsets, this->vWeights,
+ this->vWorkY,
+ 0);
+ proc_height /= 2;
+ do_warp_yv12(new_frame->base[1], frame->base[1],
+ new_frame->pitches[1], frame->pitches[1],
+ this->output_width/2, this->output_height/2,
+ frame->width/2, proc_height,
+ this->input_interlaced,
+ this->hControlUV, this->vOffsetsUV, this->vWeightsUV,
+ this->vWorkUV,
+ 0);
+ do_warp_yv12(new_frame->base[2], frame->base[2],
+ new_frame->pitches[2], frame->pitches[2],
+ this->output_width/2, this->output_height/2,
+ frame->width/2, proc_height,
+ this->input_interlaced,
+ this->hControlUV, this->vOffsetsUV, this->vWeightsUV,
+ this->vWorkUV,
+ 0);
+
+ } else if (frame->format == XINE_IMGFMT_YUY2) {
+ do_warp_yuy2(new_frame->base[0], frame->base[0],
+ new_frame->pitches[0], frame->pitches[0],
+ this->output_width, this->output_height,
+ frame->width, proc_height,
+ this->input_interlaced,
+ this->hControl, this->vOffsets, this->vWeights,
+ this->vWorkY, this->vWorkUV,
+ 0);
+ }
+}
+
+/*
+ * parameter functions
+ */
+
+static xine_post_api_descr_t *warp_get_param_descr(void)
+{
+ return &warp_param_descr;
+}
+
+static int warp_set_parameters(xine_post_t *this_gen, void *param_gen)
+{
+ warp_plugin_t *this = (warp_plugin_t *)this_gen;
+ warp_parameters_t *params = (warp_parameters_t *)param_gen;
+
+ memcpy(&this->config, params, sizeof(warp_parameters_t));
+ this->input_width = this->input_height = 0;
+
+ DBG("warp_set_parameters: "
+ "output_width=%d, output_height=%d, output_aspect=%4.3lf, no_downscaling=%d\n",
+ this->config.output_width, this->config.output_height, this->config.output_aspect,
+ this->config.no_downscaling);
+
+ return 1;
+}
+
+static int warp_get_parameters(xine_post_t *this_gen, void *param_gen)
+{
+ warp_plugin_t *this = (warp_plugin_t *)this_gen;
+ warp_parameters_t *params = (warp_parameters_t *)param_gen;
+
+ DBG("warp_get_parameters\n");
+ memcpy(params, &this->config, sizeof(warp_parameters_t));
+
+ return 1;
+}
+
+static char *warp_get_help(void) {
+ return _(
+ "The warp plugin scales video to another resolution. "
+ "It supports non-linear stretching to change video aspect ratio. "
+ "\n"
+ "Parameters\n"
+ " output_width: Scale video to width\n"
+ " (0 -> do not change video width)\n"
+ " output_height: Scale video to height\n"
+ " (0 -> do not change video height)\n"
+ " output_aspect: Adjust aspect ratio using non-linear scaling\n"
+ " (0 -> do not change video aspect ratio)\n"
+ " no_downscaling: Do not downscale video\n"
+ "\n"
+ );
+}
+
+
+/*
+ * plugin info
+ */
+
+static post_info_t info = { XINE_POST_TYPE_VIDEO_FILTER };
+
+const plugin_info_t xine_plugin_info[] __attribute__((visibility("default"))) =
+{
+ /* type, API, "name", version, special_info, init_function */
+ { PLUGIN_POST, 9, "warp", XINE_VERSION_CODE, &info, &warp_init_plugin },
+ { PLUGIN_POST, 9, "swscale", XINE_VERSION_CODE, &info, &warp_init_plugin },
+ { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};