summary refs log tree commit diff
path: root/xine_post_swscale.c
diff options
context:
space:
mode:
Diffstat (limited to 'xine_post_swscale.c')
-rw-r--r-- xine_post_swscale.c 1730
1 files changed, 0 insertions, 1730 deletions
diff --git a/xine_post_swscale.c b/xine_post_swscale.c
deleted file mode 100644
index 9e5be7c2..00000000
--- a/xine_post_swscale.c
+++ /dev/null
@@ -1,1730 +0,0 @@
-/*
- * Copyright (C) 2000-2007 the xine project
- *
- * This file is part of xine, a free video player.
- *
- * xine is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * xine is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
- *
- * $Id: xine_post_swscale.c,v 1.9 2008-12-13 14:24:03 phintuka Exp $
- *
- * Simple (faster) resize for avisynth
- * Copyright (C) 2002 Tom Barry
- *
- * Very simple 2 tap linear interpolation.
- * It is unfiltered which means it will not soften much.
- *
- * WarpedResize will do a non-linear stretch/squeeze in both the horizontal
- * and vertical dimensions. This can be useful when you want to change the
- * aspect ratio of a video clip and have it mostly distorted at the
- * top, bottom, and side edges.
- *
- *
- * Ported to linux/xine by Petri Hintukainen <phintuka@users.sourceforge.net>
- * - Added x86_64 support
- * - Added PIC support (do not clobber ebx in x86, access only local variables from asm)
- * - Fixed yv12 stretched warp tables generation
- */
-
-#include <xine/xine_internal.h>
-#include <xine/post.h>
-
-/* Debug trace macro: prints its arguments to stderr, prefixed with "post_warp: ".
- * Enable the empty definition below instead to silence the trace output.
- */
-/*#define DBG(x...)*/
-#define DBG(x...) fprintf(stderr, "post_warp: " x)
-
-/* Optional build-time switches, all disabled by default:
- *  STREAMING_STORE_TMP - store to the work buffers with non-temporal moves (movntq/movntdq)
- *  STREAMING_STORE     - not referenced in the part of the file reviewed here
- *  PREFETCH            - issue prefetcht0 at the top of the vertical loops
- */
-/*#define STREAMING_STORE_TMP*/
-/*#define STREAMING_STORE*/
-/*#define PREFETCH*/
-/* Streaming stores and prefetching seem to be slower ...
- * Tested with P3 (128M L2) and C2D (4M L2).
- * Maybe the access pattern is simple enough for the HW prefetchers.
- */
-
-/* When VANILLA is defined, init_tables_yv12() skips the separate horizontal
- * chroma table and uses the alternative vertical chroma formulas.
- */
-/*#define VANILLA*/
-
-/*
- * Warp a normalized position.
- *
- * Accepts a position from 0 to 1 and maps it to a warped position from 0 to 1
- * using the cubic blend z = (1 - w) * x^3 + w * x, where x = 2 * (position - .5)
- * is the position re-centered to -1..1 and w = 2 - wFact. After some tinkering
- * this seems to give decent values and derivatives at the right places.
- *
- * The warp equations are designed to:
- *
- * * Always be rising but yield results from 0 to 1
- *
- * * Have a first derivative that doesn't go to 0 or infinity, at least close
- *   to the center of the screen
- *
- * * Have a curvature (absolute val of 2nd derivative) that is small in the
- *   center and smoothly rises towards the edges. The curvature is 0
- *   everywhere when the warp factor is 1 (only the linear term remains).
- *
- * position: input position, 0 .. 1
- * wFact:    warp factor; 1.0 yields the identity mapping
- *
- * Returns the warped position.
- */
-static double WarpFactor(double position, double wFact)
-{
-  /* re-center the position: 0..1 -> -1..1 */
-  const double centered = 2 * (position - .5);
-  /* reverse parm for compat with initial release */
-  const double blend = 2.0 - wFact;
-  double curve;
-
-  if (!(centered < 0.0)) {
-    /* 0 <= x <= 1 */
-    curve = (1 - blend) * centered*centered*centered + blend * centered;
-    return .5 + .5 * curve;
-  }
-
-  /* -1 <= x < 0: mirrored form, amounts to the same formula as above for now */
-  curve = -(1 - blend) * centered*centered*centered - blend * centered;
-  return .5 - .5 * curve;
-}
-
-/*
- * YV12
- *
- * Build the horizontal and vertical interpolation tables for YV12 scaling.
- *
- * Horizontal tables (hControl for luma, hControlUV for chroma) hold one record
- * of six uint32 entries for each pair of output pixels:
- *   [0]      weights of the even pixel, packed as RightWeight << 16 | LeftWeight
- *   [1]      weights of the odd pixel, same packing
- *   [2], [3] not written by this function
- *   [4]      source pixel offset of the even pixel
- *   [5]      source pixel offset of the odd pixel
- * So a pair of pixels will later be processed each pass through the horizontal
- * resize loop. The weights are scaled 0-256 and the left and right weights sum
- * to 256 for each pixel.
- *
- * After the last pair the offsets of one more record are written, to give the
- * resize loop something to prefetch at the end. hControl must therefore hold at
- * least newwidth*3+6 entries and hControlUV at least (newwidth/2)*3+6 entries.
- * hControlUV is not written when VANILLA is defined.
- *
- * Vertical tables hold, per output line, the source line offset (vOffsets,
- * measured in lines and multiplied by the source pitch later) and the weight to
- * give to the 2nd source line (vWeights). YV12 needs separate luma
- * (newheight entries) and chroma (newheight/2 entries) vertical tables.
- *
- * newwidth, newheight: output size in pixels; both must be > 1 (and
- *                      newwidth/2, newheight/2 must be > 1 for the chroma
- *                      tables), otherwise the interpolation denominators are zero
- * oldwidth, oldheight: input size in pixels
- * Interlaced:          non-zero to interpolate only between lines of the same field
- * hWarp, vWarp:        warp factors passed to WarpFactor(); exactly 1 selects
- *                      plain linear scaling
- */
-static void init_tables_yv12(int newwidth, int newheight, int oldwidth, int oldheight,
-                             int Interlaced, double hWarp, double vWarp,
-                             uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights,
-                             uint32_t *hControlUV, uint32_t *vOffsetsUV, uint32_t *vWeightsUV)
-{
-  int i;
-  int j;
-  int k;
-  int wY1;
-  int wY2;
-  DBG("init_yv12: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n",
-      oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);
-
-  /* First set up the horizontal luma table.
-   * The values are generated in pairs (even pixel, odd pixel), the same way
-   * as for YUY2.
-   *
-   * The linear positions are computed in 64 bits: i * 256 * (oldwidth-1)
-   * does not fit in an int for large frames.
-   */
-
-  for (i = 0; i < newwidth; i += 2) {
-    /* first make even pixel control */
-    if (hWarp==1) /* if no warp factor */
-      j = (int) ((int64_t) i * 256 * (oldwidth-1) / (newwidth-1));
-    else /* stretch and warp somehow */
-      j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1));
-
-    k = j>>8;
-    wY2 = j - (k << 8); /* luma weight of right pixel */
-    wY1 = 256 - wY2; /* luma weight of left pixel */
-
-    if (k > oldwidth - 2) {
-      hControl[i*3+4] = oldwidth - 1; /* point to last byte */
-      hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */
-    } else {
-      hControl[i*3+4] = k; /* pixel offset */
-      hControl[i*3] = wY2 << 16 | wY1; /* luma weights */
-    }
-
-    /* now make odd pixel control */
-    if (hWarp==1) /* if no warp factor */
-      j = (int) ((int64_t) (i+1) * 256 * (oldwidth-1) / (newwidth-1));
-    else /* stretch and warp somehow */
-      j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1));
-
-    k = j>>8;
-    wY2 = j - (k << 8); /* luma weight of right pixel */
-    wY1 = 256 - wY2; /* luma weight of left pixel */
-
-    if (k > oldwidth - 2) {
-      hControl[i*3+5] = oldwidth - 1; /* point to last byte */
-      hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */
-    } else {
-      hControl[i*3+5] = k; /* pixel offset */
-      hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */
-    }
-  }
-
-  hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */
-  hControl[newwidth*3+5] = 2 * (oldwidth-1); /* " */
-#ifndef VANILLA
-  /* Horizontal chroma table: same procedure on the half-width chroma planes */
-  for (i = 0; i < newwidth/2; i += 2) {
-    /* first make even pixel control */
-    if (hWarp==1) /* if no warp factor */
-      j = (int) ((int64_t) i * 256 * (oldwidth/2-1) / (newwidth/2-1));
-    else /* stretch and warp somehow */
-      j = (int) (256 * WarpFactor(i / (newwidth/2-1.0), hWarp) * (oldwidth/2-1));
-
-    k = j>>8;
-    wY2 = j - (k << 8); /* chroma weight of right pixel */
-    wY1 = 256 - wY2; /* chroma weight of left pixel */
-
-    if (k > oldwidth/2 - 2) {
-      hControlUV[i*3+4] = oldwidth/2 - 1; /* point to last byte */
-      hControlUV[i*3] = 0x00000100; /* use 100% of rightmost sample */
-    } else {
-      hControlUV[i*3+4] = k; /* pixel offset */
-      hControlUV[i*3] = wY2 << 16 | wY1; /* chroma weights */
-    }
-
-    /* now make odd pixel control */
-    if (hWarp==1) /* if no warp factor */
-      j = (int) ((int64_t) (i+1) * 256 * (oldwidth/2-1) / (newwidth/2-1));
-    else /* stretch and warp somehow */
-      j = (int) (256 * WarpFactor((i+1) / (newwidth/2-1.0), hWarp) * (oldwidth/2-1));
-
-    k = j>>8;
-    wY2 = j - (k << 8); /* chroma weight of right pixel */
-    wY1 = 256 - wY2; /* chroma weight of left pixel */
-
-    if (k > oldwidth/2 - 2) {
-      hControlUV[i*3+5] = oldwidth/2 - 1; /* point to last byte */
-      hControlUV[i*3+1] = 0x00000100; /* use 100% of rightmost sample */
-    } else {
-      hControlUV[i*3+5] = k; /* pixel offset */
-      hControlUV[i*3+1] = wY2 << 16 | wY1; /* chroma weights */
-    }
-  }
-
-  hControlUV[newwidth/2*3+4] = (oldwidth/2-1); /* give it something to prefetch at end */
-  hControlUV[newwidth/2*3+5] = (oldwidth/2-1); /* " */
-#endif
-
-  /* Next set up vertical tables. The offsets are measured in lines and will be mult */
-  /* by the source pitch later . */
-
-  /* For YV12 we need separate Luma and chroma tables */
-
-  /* First Luma Table */
-  for (i = 0; i < newheight; ++i) {
-    if (vWarp==1) /* if no warp factor */
-      j = (int) ((int64_t) i * 256 * (oldheight-1) / (newheight-1));
-    else /* stretch and warp somehow */
-      j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1));
-
-    if (Interlaced) { /* do hard way? */
-      if (i%2) { /* is odd output line? */
-        if (j < 256) { /* before 1st odd input line */
-          vOffsets[i] = 1; /* all from line 1 */
-          vWeights[i] = 0; /* weight to give to 2nd line */
-        } else {
-          k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
-          vOffsets[i] = k;
-          wY2 = j - (k << 8);
-          vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
-        }
-      } else { /* is even output line */
-        k = (j >> 9) << 1; /* next lower even line */
-        vOffsets[i] = k;
-        wY2 = j - (k << 8);
-        vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
-      }
-    } else { /* simple way, do as progressive */
-      k = j >> 8;
-      vOffsets[i] = k;
-      wY2 = j - (k << 8);
-      vWeights[i] = wY2; /* weight to give to 2nd line */
-    }
-  }
-
-  /* Vertical table for chroma */
-  for (i = 0; i < newheight/2; ++i) {
-    if (vWarp==1) /* if no warp factor */
-#ifdef VANILLA
-      j = (int) ( (i+.25) * 256 * (oldheight-1) / (newheight-1.0) - 64 );
-#else
-      j = (int) ( (i+.25) * 256 * (oldheight/2-1) / (newheight/2-1.0) - 64 );
-#endif
-    else /* stretch and warp somehow */
-#ifdef VANILLA
-      j = (int) (256 * WarpFactor( (i+.25) / (newheight-1.0), vWarp) * (oldheight-1.0) );
-#else
-      j = (int) (256 * WarpFactor( (i+.25) / (newheight/2 - 1.0), vWarp) * (oldheight/2 - 1.0) );
-#endif
-#ifndef VANILLA
-    /* the -64 bias above can push the first positions below zero */
-    if(j<0) j=0;
-#endif
-    if (Interlaced) { /* do hard way? */
-      if (i%2) { /* is odd output line? */
-        if (j < 256) { /* before 1st odd input line */
-          vOffsetsUV[i] = 1; /* all from line 1 */
-          vWeightsUV[i] = 0; /* weight to give to 2nd line */
-        } else {
-          k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
-          vOffsetsUV[i] = k;
-          wY2 = j - (k << 8);
-          vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
-        }
-      } else { /* is even output line */
-#ifdef VANILLA
-        k = (j >> 9) << 1; /* next lower even line */
-        vOffsetsUV[i] = k;
-        wY2 = j - (k << 8);
-        vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
-#else
-        k = (j / 512) << 1; /* next lower even line */
-        vOffsetsUV[i] = k;
-        wY2 = j - (k << 8);
-        vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */
-#endif
-      }
-    } else { /* simple way, do as progressive */
-#ifdef VANILLA
-      k = j >> 8;
-#else
-      k = j / 256; /* j >> 8; does not work right if -256 < j < 0 */
-#endif
-      vOffsetsUV[i] = k;
-      wY2 = j - (k << 8);
-      vWeightsUV[i] = wY2; /* weight to give to 2nd line */
-    }
-  }
-}
-
-/*
- * YUY2
- *
- * Build the horizontal and vertical interpolation tables for YUY2 scaling.
- *
- * For each horizontal output pair of pixels hControl holds one record of six
- * uint32 entries: 2 qword masks followed by 2 int offsets.
- *   [0], [1] luma weights of the even and the odd pixel
- *   [2], [3] chroma weights of the even and the odd pixel
- *   [4], [5] source pixel offsets of the even and the odd pixel
- * Each weight entry is packed as RightWeight << 16 | LeftWeight. So a pair of
- * pixels will later be processed each pass through the horizontal resize loop.
- *
- * The weights are scaled 0-256 and the left and right weights sum to 256 for each pixel.
- *
- * After the last pair the offsets of one more record are written, to give the
- * resize loop something to prefetch at the end; hControl must therefore hold
- * at least newwidth*3+6 entries.
- *
- * vOffsets / vWeights receive one entry per output line (newheight entries):
- * the source line offset (in lines, multiplied by the source pitch later) and
- * the weight to give to the 2nd source line.
- *
- * newwidth, newheight: output size in pixels; both must be > 1, otherwise the
- *                      interpolation denominators are zero
- * oldwidth, oldheight: input size in pixels
- * Interlaced:          non-zero to interpolate only between lines of the same field
- * hWarp, vWarp:        warp factors passed to WarpFactor(); exactly 1 selects
- *                      plain linear scaling
- */
-static void init_tables_yuy2(int newwidth, int newheight, int oldwidth, int oldheight,
-                             int Interlaced, double hWarp, double vWarp,
-                             uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights )
-{
-  int i;
-  int j;
-  int k;
-  int wY1;
-  int wY2;
-  int wUV1;
-  int wUV2;
-  DBG("init_yuy2: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n",
-      oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);
-
-  /* First set up horizontal table.
-   * The linear positions are computed in 64 bits: i * 256 * (oldwidth-1)
-   * does not fit in an int for large frames.
-   */
-  for (i = 0; i < newwidth; i += 2) {
-    /* first make even pixel control */
-    if (hWarp==1) /* if no warp factor */
-      j = (int) ((int64_t) i * 256 * (oldwidth-1) / (newwidth-1));
-    else /* stretch and warp somehow */
-      j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1));
-
-    k = j>>8;
-    wY2 = j - (k << 8); /* luma weight of right pixel */
-    wY1 = 256 - wY2; /* luma weight of left pixel */
-    /* chroma is sampled once per two luma pixels: an odd luma offset starts
-     * half way (128) between two chroma samples */
-    wUV2 = (k%2) ? 128 + (wY2 >> 1) : wY2 >> 1;
-    wUV1 = 256 - wUV2;
-
-    if (k > oldwidth - 2) {
-      hControl[i*3+4] = oldwidth - 1; /* point to last byte */
-      hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */
-      hControl[i*3+2] = 0x00000100; /* use 100% of rightmost U */
-    } else {
-      hControl[i*3+4] = k; /* pixel offset */
-      hControl[i*3] = wY2 << 16 | wY1; /* luma weights */
-      hControl[i*3+2] = wUV2 << 16 | wUV1; /* chroma weights */
-    }
-
-    /* now make odd pixel control */
-    if (hWarp==1) /* if no warp factor */
-      j = (int) ((int64_t) (i+1) * 256 * (oldwidth-1) / (newwidth-1));
-    else /* stretch and warp somehow */
-      j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1));
-
-    k = j>>8;
-    wY2 = j - (k << 8); /* luma weight of right pixel */
-    wY1 = 256 - wY2; /* luma weight of left pixel */
-    wUV2 = (k%2) ? 128 + (wY2 >> 1) : wY2 >> 1;
-    wUV1 = 256 - wUV2;
-
-    if (k > oldwidth - 2) {
-      hControl[i*3+5] = oldwidth - 1; /* point to last byte */
-      hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */
-      hControl[i*3+3] = 0x00000100; /* use 100% of rightmost V */
-    } else {
-      hControl[i*3+5] = k; /* pixel offset */
-      hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */
-      /* hControl[i*3+3] = wUV2 << 16 | wUV1; // chroma weights */
-      /* horiz chroma weights should be same as for even pixel - trbarry 09/16/2002 */
-      hControl[i*3+3] = hControl[i*3+2]; /* chroma weights */
-    }
-  }
-
-  hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */
-  hControl[newwidth*3+5] = 2 * (oldwidth-1);
-
-  /* Next set up vertical table. The offsets are measured in lines and will be mult */
-  /* by the source pitch later */
-  for (i = 0; i < newheight; ++i) {
-    if (vWarp==1) /* if no warp factor */
-      j = (int) ((int64_t) i * 256 * (oldheight-1) / (newheight-1));
-    else /* stretch and warp somehow */
-      j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1));
-
-    if (Interlaced) { /* do hard way? */
-      if (i%2) { /* is odd output line? */
-        if (j < 256) { /* before 1st odd input line */
-          vOffsets[i] = 1; /* all from line 1 */
-          vWeights[i] = 0; /* weight to give to 2nd line */
-        } else {
-          k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
-          vOffsets[i] = k;
-          wY2 = j - (k << 8);
-          vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
-        }
-      } else { /* is even output line */
-        k = (j >> 9) << 1; /* next lower even line */
-        vOffsets[i] = k;
-        wY2 = j - (k << 8);
-        vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
-      }
-    } else { /* simple way, do as progressive */
-      k = j >> 8;
-      vOffsets[i] = k;
-      wY2 = j - (k << 8);
-      vWeights[i] = wY2; /* weight to give to 2nd line */
-    }
-  }
-}
-
-/* Register allocation */
-/* index/counter registers (REGA, REGC) are loaded from 32bit vars/arrays ! */
-/* REGEA / REGEB always name the 32-bit registers. REGEA is used where a 32-bit
- * value is loaded from the uint32 control table (movl) or compared (cmpl).
- * The remaining names select the native-width registers for pointers; note
- * that REGC stays "ecx" on x86_64 as well.
- * Nothing is defined for other architectures; the asm that uses these names is
- * itself guarded by __i386__ / __x86_64__.
- */
-#define REGEA "eax"
-#define REGEB "ebx"
-#if defined(__x86_64__)
-# define REGA "rax"
-# define REGB "rbx"
-# define REGC "ecx"
-# define REGD "rdx"
-# define REGDI "rdi"
-# define REGSI "rsi"
-#elif defined(__i386__)
-# define REGA "eax"
-# define REGB "ebx"
-# define REGC "ecx"
-# define REGD "edx"
-# define REGDI "edi"
-# define REGSI "esi"
-#endif
-
-/* variables accessed from assembler code */
-/* Each name expands to a positional asm operand ("%N"). The number must match
- * the position of the corresponding variable in the operand list of the asm
- * statement that uses it; the two have to be kept in sync by hand.
- * Operands 14-16 exist only in the non-x86_64 build.
- */
-#define _FPround1 "%0"
-#define _vWeight1 "%1"
-#define _vWeight2 "%2"
-#define _YMask "%3"
-#define _src_row_size "%4"
-#define _EndOffset "%5"
-#define _pControl "%6"
-#define _row_size "%7"
-#define _vWorkYW "%8"
-#define _dstp "%9"
-#define _vWorkUVW "%10"
-#define _FPround2 "%11"
-#define _srcp1 "%12"
-#define _srcp2 "%13"
-#if !defined(__x86_64__)
-#define _oldbx "%14"
-#define _SSEMMXenabledW "%15"
-#define _SSE2enabledW "%16"
-#endif
-
-/* Labels */
-/* Numeric local labels for the asm templates; they are referenced with an
- * "f" (forward) or "b" (backward) suffix appended at the point of use.
- */
-#define vMaybeSSEMMX "1"
-#define LessThan8 "2"
-#define LessThan4 "3"
-#define AllDone "4"
-#define LastOne "5"
-#define vLoopSSE2_Fetch "6"
-#define vLoopSSE2 "7"
-#define vLoopSSEMMX_Fetch "8"
-#define vLoopSSEMMX "9"
-#define vLoopMMX "10"
-#define MoreSpareChange "11"
-#define DoHorizontal "12"
-#define hLoopMMX "13"
-#define hLoopMMXSSE "14"
-
-
-/* structure for mmx constants */
-/* 64-bit constant, viewable as one quadword or two doublewords.
- * ATTR_ALIGN is defined outside this file; presumably it requests 16-byte alignment.
- */
-typedef union {
- uint64_t uq[1]; /* Unsigned Quadword */
- uint32_t ud[2]; /* Unsigned Doubleword */
-} ATTR_ALIGN(16) mmx_t;
-
-/* structure for sse2 constants */
-/* 128-bit constant, viewable as two quadwords or four doublewords.
- * Values of this type are loaded with movdqa, which needs 16-byte aligned memory.
- */
-typedef union {
- uint64_t uq[2]; /* Unsigned Quadword */
- uint32_t ud[4]; /* Unsigned Doubleword */
-} ATTR_ALIGN(16) sse2_t;
-
-
-/*
- * Scale (and warp) a packed YUY2 image, one output line at a time.
- *
- * Each output line is produced in two passes:
- *  1. vertical: the two source lines selected by vOffsets[y] are blended with
- *     the weights derived from vWeights[y]; the result is stored split into
- *     luma (vWorkY, one 16-bit word per sample) and chroma (vWorkUV, bytes).
- *  2. horizontal: the work lines are resampled using the hControl records
- *     (see init_tables_yuy2) and written to the destination line.
- *
- * dst, dst_pitch:       destination image and its line pitch in bytes
- * src, src_pitch:       source image and its line pitch in bytes
- * dst_width/dst_height: destination size in pixels / lines
- * src_width/src_height: source size in pixels / lines available in src
- * Interlaced:           non-zero to blend with the next line of the same field
- * hControl, vOffsets, vWeights: tables built by init_tables_yuy2
- * vWorkY, vWorkUV:      scratch buffers for one vertically scaled line
- * dst_start:            first destination line to produce
- *
- * Returns the index of the next destination line to produce when the required
- * source line is not within src_height (slice completed), or 0 when all lines
- * up to dst_height have been written. On builds that are neither i386 nor
- * x86_64 nothing is done and 0 is returned.
- *
- * NOTE(review): the asm uses MMX/XMM registers that are not listed as
- * clobbered, and on i386 it stores ebx into oldbx although oldbx is passed as
- * an input operand.
- */
-static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
- const int dst_pitch, const int src_pitch,
- const int dst_width, const int dst_height,
- const int src_width, const int src_height,
- const int Interlaced, const uint32_t * const hControl,
- const uint32_t * const vOffsets, const uint32_t * const vWeights,
- uint32_t *vWorkY, uint32_t *vWorkUV,
- int dst_start)
-{
-#if defined(__i386__) || defined(__x86_64__)
- sse2_t YMask = {uq:{UINT64_C(0x00ff00ff00ff00ff),UINT64_C(0x00ff00ff00ff00ff)}}; /* keeps only luma */
- sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */
- sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */
- sse2_t vWeight1;
- sse2_t vWeight2;
-
- const uint32_t *pControl = &hControl[0];
- const uint32_t *vWorkYW = vWorkY;
- const uint32_t *vWorkUVW = vWorkUV;
- const uint8_t *srcp = src;
- const uint8_t *srcp1;
- const uint8_t *srcp2;
- uint8_t *dstp = dst + dst_pitch*dst_start;
-
- const uint32_t src_row_size = src_width * 2; /* bytes per source line, 2 bytes per pixel */
- const uint32_t row_size = dst_width * 2; /* bytes per destination line */
- const uint32_t EndOffset = src_row_size / 2; /* end of line in the units the asm index register counts */
-
-#if !defined(__x86_64__)
- const int accel = xine_mm_accel();
- const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */
- const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
- long int oldbx;
-#endif
- int y;
-
- for (y = dst_start; y < dst_height; y++) {
-
- if(vOffsets[y] >= src_height) {
- /* slice completed */
- /*DBG("do_warp_yuy2: max input height reached: need line %d, height %d\n -> Returning next output line: %d\n",
- vOffsets[y], src_height, y);*/
- return y;
- }
-
- /* replicate the two 16-bit line weights (sum 256) into every word */
- vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] =
- (256-vWeights[y]) << 16 | (256-vWeights[y]);
- vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] =
- vWeights[y] << 16 | vWeights[y];
-
- /* second line to blend: next line (of the same field when interlaced),
- * or the first line again for the last output line(s) */
- srcp1 = srcp + vOffsets[y] * src_pitch;
- if (Interlaced)
- srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1;
- else
- srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1;
-
- __asm__ __volatile__ (
-#if !defined(__x86_64__)
- /* store ebx (PIC) */
- "mov %%"REGB", "_oldbx" \n\t"
-#endif
- "movl "_src_row_size", %%"REGC" \n\t"
- "shrl $3, %%"REGC" \n\t" /* 8 bytes a time */
- "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */
- "mov "_srcp2", %%"REGD" \n\t" /* next " */
- "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */
- "mov "_vWorkUVW", %%"REGB" \n\t" /* chroma work destination line */
- "xor %%"REGA", %%"REGA" \n\t"
-#if !defined(__x86_64__)
- /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
- * This first loop is not the performance bottleneck anyway but it is trivial to tune
- * using SSE2 if we have proper alignment.
- */
- "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported?*/
- "jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway*/
-#endif
- "cmpl $2, %%"REGC" \n\t" /* we have at least 16 bytes, 2 qwords? */
- "jl "vMaybeSSEMMX"f \n\t" /* n, don't bother*/
-
- "shrl $1, %%"REGC" \n\t" /* do 16 bytes at a time instead*/
- "decl %%"REGC" \n" /* jigger loop ct */
-
- ".align 16 \n\t"
-
- "movdqa "_FPround1", %%xmm0 \n\t"
- "movdqa "_vWeight1", %%xmm5 \n\t"
- "movdqa "_vWeight2", %%xmm6 \n\t"
- "movdqa "_YMask", %%xmm7 \n"
-
- ""vLoopSSE2_Fetch": \n\t"
-#ifdef PREFETCH
- " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
- " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
-#endif
- ""vLoopSSE2": \n\t"
- " movdqu (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
- " movdqu (%%"REGD", %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */
-
- " movdqa %%xmm1, %%xmm3 \n\t" /* get chroma bytes */
- " pand %%xmm7, %%xmm1 \n\t" /* keep only luma */
- " psrlw $8, %%xmm3 \n\t" /* right just chroma */
- " pmullw %%xmm5, %%xmm1 \n\t" /* mult by weighting factor */
- " pmullw %%xmm5, %%xmm3 \n\t" /* mult by weighting factor */
-
- " movdqa %%xmm2, %%xmm4 \n\t" /* get chroma bytes */
- " pand %%xmm7, %%xmm2 \n\t" /* keep only luma */
- " psrlw $8, %%xmm4 \n\t" /* right just chroma */
- " pmullw %%xmm6, %%xmm2 \n\t" /* mult by weighting factor */
- " pmullw %%xmm6, %%xmm4 \n\t" /* mult by weighting factor */
-
- " paddw %%xmm2, %%xmm1 \n\t" /* combine lumas */
- " paddusw %%xmm0, %%xmm1 \n\t" /* round */
- " psrlw $8, %%xmm1 \n\t" /* right adjust luma */
-#ifdef STREAMING_STORE_TMP
- " movntdq %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
-#else
- " movdqu %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
-#endif
- " paddw %%xmm4, %%xmm3 \n\t" /* combine chromas */
- " paddusw %%xmm0, %%xmm3 \n\t" /* round */
- " psrlw $8, %%xmm3 \n\t" /* right adjust chroma */
- " packuswb %%xmm3, %%xmm3 \n\t" /* pack UV's into low dword */
- " movdq2q %%xmm3, %%mm1 \n\t" /* save in our work area */
-#ifdef STREAMING_STORE_TMP
- " movntq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
-#else
- " movq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
-#endif
- " lea 8(%%"REGA"), %%"REGA" \n\t"
- " decl %%"REGC" \n\t"
-
- " jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */
- " jz "vLoopSSE2"b \n\t" /* or just loop, or not */
-
- /* done with our SSE2 fortified loop but we may need to pick up the spare change */
-#ifdef STREAMING_STORE_TMP
- " sfence \n\t"
-#endif
- " movl "_src_row_size", %%"REGC" \n\t" /* get count again */
- " andl $15, %%"REGC" \n\t" /* just need mod 16 */
-
- " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
- " movq "_vWeight1", %%mm5 \n\t"
- " movq "_vWeight2", %%mm6 \n\t"
- " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
-
- " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
- " jz "MoreSpareChange"f \n" /* n, did them all */
-
- /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
- * This first loop is not the performance bottleneck anyway but it is trivial to tune
- * using SSE if we have proper alignment.
- */
- ""vMaybeSSEMMX": \n\t"
-
- " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
- " movq "_vWeight1", %%mm5 \n\t"
- " movq "_vWeight2", %%mm6 \n\t"
- " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
-#if !defined(__x86_64__)
- " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
- " jz "vLoopMMX"f \n\t" /* n, can't do anyway */
-#endif
- " decl %%"REGC" \n" /* jigger loop ctr */
-
- ".align 16 \n"
- ""vLoopSSEMMX_Fetch": \n\t"
-#ifdef PREFETCH
- " prefetcht0 8(%%"REGSI", %%"REGA", 2) \n\t"
- " prefetcht0 8(%%"REGD", %%"REGA", 2) \n"
-#endif
- ""vLoopSSEMMX": \n\t"
- " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
- " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
-
- " movq %%mm1, %%mm3 \n\t" /* copy top bytes */
- " pand %%mm7, %%mm1 \n\t" /* keep only luma */
- " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */
- " psrlw $8, %%mm3 \n\t" /* right just chroma */
- " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */
- " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */
-
- " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */
- " pand %%mm7, %%mm2 \n\t" /* keep only luma */
- " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */
- " psrlw $8, %%mm4 \n\t" /* right just chroma */
- " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */
- " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */
-
- " paddw %%mm2, %%mm1 \n\t" /* combine lumas */
- " paddusw %%mm0, %%mm1 \n\t" /* round */
- " psrlw $8, %%mm1 \n\t" /* right adjust luma */
-#ifdef STREAMING_STORE_TMP
- " movntq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
-#else
- " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
-#endif
- " paddw %%mm4, %%mm3 \n\t" /* combine chromas */
- " paddusw %%mm0, %%mm3 \n\t" /* round */
- " psrlw $8, %%mm3 \n\t" /* right adjust chroma */
- " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */
- " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
-
- " lea 4(%%"REGA"), %%"REGA" \n\t"
- " decl %%"REGC" \n\t"
- " jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */
- " jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */
-#ifdef STREAMING_STORE_TMP
- " sfence \n\t"
-#endif
- " jmp "MoreSpareChange"f \n" /* all done with vertical */
-
- ".align 16 \n"
- ""vLoopMMX": \n\t"
-
- " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
- " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
-
- " movq %%mm1, %%mm3 \n\t" /* copy top bytes */
- " pand %%mm7, %%mm1 \n\t" /* keep only luma */
- " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */
- " psrlw $8, %%mm3 \n\t" /* right just chroma */
- " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */
- " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */
-
- " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */
- " pand %%mm7, %%mm2 \n\t" /* keep only luma */
- " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */
- " psrlw $8, %%mm4 \n\t" /* right just chroma */
- " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */
- " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */
-
- " paddw %%mm2, %%mm1 \n\t" /* combine lumas */
- " paddusw %%mm0, %%mm1 \n\t" /* round */
- " psrlw $8, %%mm1 \n\t" /* right adjust luma */
- " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
-
- " paddw %%mm4, %%mm3 \n\t" /* combine chromas */
- " paddusw %%mm0, %%mm3 \n\t" /* round */
- " psrlw $8, %%mm3 \n\t" /* right adjust chroma */
- " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */
- " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
-
- " lea 4(%%"REGA"), %%"REGA" \n\t"
- " loop "vLoopMMX"b \n"
-
- /* Add a little code here to check if we have 2 more pixels to do and, if so, make one
- * more pass thru vLoopMMX. We were processing in multiples of 4 pixels and always have
- * an even number so there will never be more than 2 left. trbarry 7/29/2002
- */
- ""MoreSpareChange": \n\t"
-
- " cmpl "_EndOffset", %%"REGEA" \n\t" /* did we get them all */
- " jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */
- " movl $1, %%"REGC" \n\t" /* jigger loop ct */
- " sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
- " jmp "vLoopMMX"b \n"
-
- /* We've taken care of the vertical scaling, now do horizontal */
- ""DoHorizontal": \n\t"
-
- " movq "_YMask", %%mm7 \n\t" /* useful 0U0U.. mask constant */
- " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
- " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
- " movl "_row_size", %%"REGC" \n\t"
- " shrl $2, %%"REGC" \n\t" /* 4 bytes a time, 2 pixels */
- " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */
- " mov "_dstp", %%"REGDI" \n\t" /* the destination line */
- " mov "_vWorkUVW", %%"REGB" \n" /* chroma data, as UVUV UVUV... */
-
- ".align 16 \n"
- ""hLoopMMX": \n\t"
-
- /* x86_64: must use movl (accessing table of uint32's) */
- " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
- " movd (%%"REGD", %%"REGA", 2), %%mm0 \n\t" /* copy luma pair */
- " shr $1, %%"REGA" \n\t" /* div offset by 2 */
- " movd (%%"REGB", %%"REGA", 2), %%mm1 \n\t" /* copy UV pair VUVU */
- " psllw $8, %%mm1 \n\t" /* shift out V, keep 0000U0U0 */
-
- /* we need to use both even and odd chroma from same location - trb 9/2002 */
- " punpckldq (%%"REGB", %%"REGA", 2), %%mm1 \r\n" /* copy UV pair VUVU */
- " psrlw $8, %%mm1 \r\n" /* shift out U0, keep 0V0V 0U0U */
- " movl 20(%%"REGSI"), %%"REGEA" \r\n" /* get data offset in pixels, 2nd pixel pair */
- " punpckldq (%%"REGD", %%"REGA", 2), %%mm0 \r\n" /* copy luma pair */
-
- " pmaddwd (%%"REGSI"), %%mm0 \r\n" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm0 \r\n" /* round */
- " psrlw $8, %%mm0 \r\n" /* right just 2 luma pixel value 000Y,000Y */
-
- " pmaddwd 8(%%"REGSI"), %%mm1 \r\n" /* mult and sum chromas by ctl weights */
- " paddusw %%mm6, %%mm1 \r\n" /* round */
- " pslld $8, %%mm1 \r\n" /* shift into low bytes of different words */
- " pand %%mm7, %%mm1 \r\n" /* keep only 2 chroma values 0V00,0U00 */
- " por %%mm1, %%mm0 \r\n" /* combine luma and chroma, 0V0Y,0U0Y */
- " packuswb %%mm0, %%mm0 \r\n" /* pack all into low dword, xxxxVYUY */
- " movd %%mm0, (%%"REGDI") \n\t" /* done with 2 pixels */
-
- " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes */
- " lea 4(%%"REGDI"), %%"REGDI" \n\t" /* bump to next output pixel addr */
-
- " loop "hLoopMMX"b \n\t" /* loop for more */
-
- "emms \n\t"
- /* done with one line */
-
-#if !defined(__x86_64__)
- "mov "_oldbx", %%"REGB" \n\t"
-#endif
- ::
- "m" /*0*/(FPround1),
- "m" /*1*/(vWeight1),
- "m" /*2*/(vWeight2),
- "m" /*3*/(YMask),
- "m" /*4*/(src_row_size),
- "m" /*5*/(EndOffset),
- "m" /*6*/(pControl),
- "m" /*7*/(row_size),
- "m" /*8*/(vWorkYW),
- "m" /*9*/(dstp),
- "m" /*10*/(vWorkUVW),
- "m" /*11*/(FPround2),
- "m" /*12*/(srcp1),
- "m" /*13*/(srcp2)
- /* "memory": the asm stores through the work buffer and destination
- * pointers (and into oldbx), none of which is an output operand.
- * "cc": the arithmetic and compare instructions modify the flags. */
-#if !defined(__x86_64__)
- ,
- "m" /*14*/(oldbx),
- "m" /*15*/(SSEMMXenabledW),
- "m" /*16*/(SSE2enabledW)
- : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI, "memory", "cc"
-#else
- : REGA, REGB, REGC, REGD, REGSI, REGDI, "memory", "cc"
-#endif
- );
-
- dstp += dst_pitch;
- }
-#endif
- return 0;
-}
-
-static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
- const int dst_pitch, const int src_pitch,
- const int dst_width, const int dst_height,
- const int src_width, const int src_height,
- const int Interlaced, const uint32_t * const hControl,
- const uint32_t * vOffsets, const uint32_t * vWeights,
- uint32_t *vWorkY, int dst_start)
-{
-#if defined(__i386__) || defined(__x86_64__)
- sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */
- sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */
- sse2_t vWeight1;
- sse2_t vWeight2;
-
- const uint32_t *pControl = &hControl[0];
- const uint32_t *vWorkYW = vWorkY;
- const uint8_t *srcp = src;
- const uint8_t *srcp1;
- const uint8_t *srcp2;
- uint8_t *dstp = dst + dst_pitch*dst_start;
-
- const uint32_t src_row_size = src_width;
- const uint32_t row_size = dst_width;
-
-#if !defined(__x86_64__)
- const int accel = xine_mm_accel();
- const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */
- const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
- long int oldbx;
-#endif
- int y;
-
- /* Operation in sliced mode:
- * - continue until required next source line is out of slice
- * - return next output line
- * - at next call, continue from next souce line
- */
-
- for (y = dst_start; y < dst_height; y++) {
- if(vOffsets[y] >= src_height) {
- /* slice completed */
- /*DBG("do_warp_yv12: max input height reached: need line %d, height %d\n -> Returning next output line: %d , start was %d\n",
- (int)vOffsets[y], (int)src_height, (int)y, (int)dst_start);*/
- return y;
- }
-
- vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] =
- (256-vWeights[y]) << 16 | (256-vWeights[y]);
- vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] =
- vWeights[y] << 16 | vWeights[y];
-
- srcp1 = srcp + vOffsets[y] * src_pitch;
-
- if (Interlaced)
- srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1;
- else
- srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1;
-
- __asm__ __volatile__(
- "movl "_src_row_size", %%"REGC" \n\t"
- "shr $3, %%"REGC" \n\t" /* 8 bytes a time */
- "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */
- "mov "_srcp2", %%"REGD" \n\t" /* next " */
- "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */
- "xor %%"REGA", %%"REGA" \n\t"
-#if !defined(__x86_64__)
- /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
- * This first loop is not the performance bottleneck anyway but it is trivial to tune
- * using SSE2 if we have proper alignment.
- */
- "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported? */
- "jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway */
-#endif
- "cmpl $2, %%"REGC" \n\t" /* we have at least 16 byts, 2 qwords? */
- "jl "vMaybeSSEMMX"f \n\t" /* n, don't bother */
-
- "mov %%"REGSI", %%"REGB" \n\t"
- "or %%"REGD", %%"REGB" \n\t"
- "test $15, %%"REGB" \n\t" /* both src rows 16 byte aligned? */
- "jnz "vMaybeSSEMMX"f \n\t" /* n, don't use sse2 */
-
- "shr $1, %%"REGC" \n\t" /* do 16 bytes at a time instead */
- "dec %%"REGC" \n\t" /* jigger loop ct */
-
- "movdqa "_FPround1", %%xmm0 \n\t"
- "movdqa "_vWeight1", %%xmm5 \n\t"
- "movdqa "_vWeight2", %%xmm6 \n\t"
- "pxor %%xmm7, %%xmm7 \n"
-
- ".align 16 \n"
- ""vLoopSSE2_Fetch": \n\t"
-#ifdef PREFETCH
- " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
- " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
-#endif
- ""vLoopSSE2": \n\t"
- /* we're already checked pointers to be on dqword aligned */
- " movdqa (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */
- " movdqa (%%"REGD", %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */
- " movdqa %%xmm1, %%xmm2 \n\t"
- " movdqa %%xmm3, %%xmm4 \n\t"
-
- " punpcklbw %%xmm7, %%xmm1 \n\t" /* make words */
- " punpckhbw %%xmm7, %%xmm2 \n\t" /* " */
- " punpcklbw %%xmm7, %%xmm3 \n\t" /* " */
- " punpckhbw %%xmm7, %%xmm4 \n\t" /* " */
-
- " pmullw %%xmm5, %%xmm1 \n\t" /* mult by top weighting factor */
- " pmullw %%xmm5, %%xmm2 \n\t" /* " */
- " pmullw %%xmm6, %%xmm3 \n\t" /* mult by bot weighting factor */
- " pmullw %%xmm6, %%xmm4 \n\t" /* " */
-
- " paddw %%xmm3, %%xmm1 \n\t" /* combine lumas low */
- " paddw %%xmm4, %%xmm2 \n\t" /* combine lumas high */
-
- " paddusw %%xmm0, %%xmm1 \n\t" /* round */
- " paddusw %%xmm0, %%xmm2 \n\t" /* round */
-
- " psrlw $8, %%xmm1 \n\t" /* right adjust luma */
- " psrlw $8, %%xmm2 \n\t" /* right adjust luma */
-
- " packuswb %%xmm2, %%xmm1 \n\t" /* pack words to our 16 byte answer */
-#ifdef STREAMING_STORE_TMP
- " movntdq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
-#else
- " movdqu %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
-#endif
- " lea 16(%%"REGA"), %%"REGA" \n\t"
- " decl %%"REGC" \n\t"
-
- " jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */
- " jz "vLoopSSE2"b \n\t" /* or just loop, or not */
-
- /* done with our SSE2 fortified loop but we may need to pick up the spare change */
-#ifdef STREAMING_STORE_TMP
- " sfence \n\t"
-#endif
- " movl "_src_row_size", %%"REGC" \n\t" /* get count again */
- " andl $15, %%"REGC" \n\t" /* just need mod 16 */
- " movq "_vWeight1", %%mm5 \n\t"
- " movq "_vWeight2", %%mm6 \n\t"
- " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
-
- " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
- " jz "MoreSpareChange"f \n" /* n, did them all */
-
- /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
- * This first loop is not the performance bottleneck anyway but it is trivial to tune
- * using SSE if we have proper alignment.
- */
- ""vMaybeSSEMMX": \n\t"
-
- " movq "_vWeight1", %%mm5 \n\t"
- " movq "_vWeight2", %%mm6 \n\t"
- " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
- " pxor %%mm7, %%mm7 \n\t"
-#if !defined(__x86_64__)
- " testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXTsupported? */
- " jz "vLoopMMX"f \n\t" /* n, can't do anyway */
-#endif
- " decl %%"REGC" \n" /* jigger loop ctr */
-
- ".align 16 \n"
- ""vLoopSSEMMX_Fetch": \n\t"
-#ifdef PREFETCH
- " prefetcht0 8(%%"REGSI", %%"REGA") \n\t"
- " prefetcht0 8(%%"REGD", %%"REGA") \n"
-#endif
- ""vLoopSSEMMX": \n\t"
-
- " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
- " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
-
- " movq %%mm1, %%mm2 \n\t"
- " movq %%mm3, %%mm4 \n\t"
-
- " punpcklbw %%mm7, %%mm1 \n\t" /* make words */
- " punpckhbw %%mm7, %%mm2 \n\t" /* " */
- " punpcklbw %%mm7, %%mm3 \n\t" /* " */
- " punpckhbw %%mm7, %%mm4 \n\t" /* " */
-
- " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */
- " pmullw %%mm5, %%mm2 \n\t" /* " */
- " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */
- " pmullw %%mm6, %%mm4 \n\t" /* " */
-
- " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */
- " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */
-
- " paddusw %%mm0, %%mm1 \n\t" /* round */
- " paddusw %%mm0, %%mm2 \n\t" /* round */
-
- " psrlw $8, %%mm1 \n\t" /* right adjust luma */
- " psrlw $8, %%mm2 \n\t" /* right adjust luma */
-
- " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 8 byte answer */
-#ifdef STREAMING_STORE_TMP
- " movntq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
-#else
- " movq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
-#endif
- " lea 8(%%"REGA"), %%"REGA" \n\t"
- " decl %%"REGC" \n\t"
-
- " jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */
- " jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */
-#ifdef STREAMING_STORE_TMP
- " sfence \n\t"
-#endif
- " jmp "MoreSpareChange"f \n" /* all done with vertical */
-
- ".align 16 \n"
- ""vLoopMMX": \n\t"
-
- " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
- " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
-
- " movq %%mm1, %%mm2 \n\t"
- " movq %%mm3, %%mm4 \n\t"
-
- " punpcklbw %%mm7, %%mm1 \n\t" /* make words */
- " punpckhbw %%mm7, %%mm2 \n\t" /* " */
- " punpcklbw %%mm7, %%mm3 \n\t" /* " */
- " punpckhbw %%mm7, %%mm4 \n\t" /* " */
-
- " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */
- " pmullw %%mm5, %%mm2 \n\t" /* " */
- " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */
- " pmullw %%mm6, %%mm4 \n\t" /* " */
-
- " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */
- " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */
-
- " paddusw %%mm0, %%mm1 \n\t" /* round */
- " paddusw %%mm0, %%mm2 \n\t" /* round */
-
- " psrlw $8, %%mm1 \n\t" /* right adjust luma */
- " psrlw $8, %%mm2 \n\t" /* right adjust luma */
-
- " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 8 byte answer */
- " movq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
-
- " lea 8(%%"REGA"), %%"REGA" \n\t"
- " loop "vLoopMMX"b \n"
-
- /* Add a little code here to check if we have more pixels to do and, if so, make one
- * more pass thru vLoopMMX. We were processing in multiples of 8 pixels and alway have
- * an even number so there will never be more than 7 left.
- */
- ""MoreSpareChange": \n\t"
-
- " cmpl "_src_row_size", %%"REGEA" \n\t" /* did we get them all */
- " jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */
- " movl $1, %%"REGC" \n\t" /* jigger loop ct */
- " movl "_src_row_size", %%"REGEA" \n\t"
- " sub $8, %%"REGA" \n\t" /* back up to last 8 pixels */
- " jmp "vLoopMMX"b \n"
-
- /* We've taken care of the vertical scaling, now do horizontal */
- ""DoHorizontal": \n\t"
- " pxor %%mm7, %%mm7 \n\t"
- " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
- " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
- " movl "_row_size", %%"REGC" \n\t"
- " shrl $2, %%"REGC" \n\t" /* 4 bytes a time, 4 pixels */
- " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */
- " mov "_dstp", %%"REGDI" \n\t" /* the destination line */
-#if !defined(__x86_64__)
- " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
- " jz "hLoopMMX"f \n\t" /* n, can't do anyway */
-#endif
- /* With SSE support we will make 8 pixels (from 8 pairs) at a time */
- " shrl $1, %%"REGC" \n\t" /* 8 bytes a time instead of 4 */
- " jz "LessThan8"f \n"
-
- ".align 16 \n"
- ""hLoopMMXSSE": \n\t"
-
-
- /* handle first 2 pixels */
- /* phi: must use movl here (x86_64, reading from table of uint_32's) */
- " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
- " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
-
- " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
- " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
- " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
- " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */
- " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */
- " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm0 \n\t" /* round */
- " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
-
- /* handle 3rd and 4th pixel pairs */
- " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */
- " punpcklwd (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */
- " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
- " movl 16+48(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 5st pixel pair */
- " movl 20+48(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 6nd pixel pair */
- " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm1 \n\t" /* round */
- " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
-
- /* handle 5th and 6th pixel pairs */
- " movd (%%"REGD", %%"REGA"), %%mm2 \n\t" /* copy luma pair 0000xxYY */
- " punpcklwd (%%"REGD", %%"REGB"), %%mm2 \r\n" /* 2nd luma pair, now xxxxYYYY */
- " punpcklbw %%mm7, %%mm2 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
- " movl 16+72(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 7st pixel pair */
- " movl 20+72(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 8nd pixel pair */
- " pmaddwd 48(%%"REGSI"), %%mm2 \n\t" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm2 \n\t" /* round */
- " psrlw $8, %%mm2 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
-
- /* handle 7th and 8th pixel pairs */
- " movd (%%"REGD", %%"REGA"), %%mm3 \n\t" /* copy luma pair 0000xxYY */
- " punpcklwd (%%"REGD", %%"REGB"), %%mm3 \r\n" /* 2nd luma pair, now xxxxYYYY */
- " punpcklbw %%mm7, %%mm3 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
- " pmaddwd 72(%%"REGSI"), %%mm3 \n\t" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm3 \n\t" /* round */
- " psrlw $8, %%mm3 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
-
- /* combine, store, and loop */
- " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */
- " packuswb %%mm3, %%mm2 \n\t" /* pack into qword, 0Y0Y0Y0Y */
- " packuswb %%mm2, %%mm0 \n\t" /* and again into YYYYYYYY */
-#ifdef STREAMING_STORE
- " movntq %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */
-#else
- " movq %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */
-#endif
-
- " lea 96(%%"REGSI"), %%"REGSI" \n\t"
- " lea 8(%%"REGDI"), %%"REGDI" \n\t"
- " decl %%"REGC" \n\t"
- " jg "hLoopMMXSSE"b \n\t" /* loop for more */
-#ifdef STREAMING_STORE
- " sfence \n"
-#endif
- ""LessThan8": \n\t"
- " movl "_row_size", %%"REGC" \n\t"
- " andl $7, %%"REGC" \n\t" /* we have done all but maybe this */
- " shrl $2, %%"REGC" \n\t" /* now do only 4 bytes at a time */
- " jz "LessThan4"f \n"
-
- ".align 16 \n"
- ""hLoopMMX": \n\t"
-
- /* handle first 2 pixels */
- " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
- " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
- " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
- " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
- " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
- " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */
- " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */
- " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm0 \n\t" /* round */
- " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
-
- /* handle 3rd and 4th pixel pairs */
- " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */
- " punpckldq (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */
- " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
- " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm1 \n\t" /* round */
- " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
-
- /* combine, store, and loop */
- " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */
- " packuswb %%mm7, %%mm0 \n\t" /* and again into 0000YYYY */
- " movd %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */
- " lea 48(%%"REGSI"), %%"REGSI" \n\t"
- " lea 4(%%"REGDI"), %%"REGDI" \n\t"
-
- " loop "hLoopMMX"b \n" /* loop for more */
-
- /* test to see if we have a mod 4 size row, if not then more spare change */
- ""LessThan4": \n\t"
- " movl "_row_size", %%"REGC" \n\t"
- " andl $3, %%"REGC" \n\t" /* remainder side mod 4 */
- " cmpl $2, %%"REGC" \n\t"
- " jl "LastOne"f \n\t" /* none, none */
-
- /* handle 2 more pixels */
- " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
- " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */
- " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
- " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */
- " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
-
- " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm0 \n\t" /* round */
- " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
- " packuswb %%mm7, %%mm0 \n\t" /* pack into qword, 00000Y0Y */
- " packuswb %%mm7, %%mm0 \n\t" /* and again into 000000YY */
- " movd %%mm0, (%%"REGDI") \n\t" /* store, we are guarrenteed room in buffer (8 byte mult) */
- " subl $2, %%"REGC" \n\t"
-
- " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes */
- " lea 2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */
-
- /* maybe one last pixel */
- ""LastOne": \n\t"
- " cmpl $0, %%"REGC" \r\n" /* still more ? */
- " jz "AllDone"f \r\n" /* n, done */
- " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
- " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
- " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
-
- " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */
- " paddusw %%mm6, %%mm0 \n\t" /* round */
- " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */
- " movd %%mm0, %%"REGEA" \n\t"
- " movb %%al, (%%"REGDI") \n" /* store last one */
-
- ""AllDone": \n\t"
- " emms \n\t"
-#if !defined(__x86_64__)
- "mov "_oldbx", %%"REGB" \n\t"
-#endif
- ::
- "m" /*0*/(FPround1),
- "m" /*1*/(vWeight1),
- "m" /*2*/(vWeight2),
- "m" /*3*/(y/*YMask[0]*/),
- "m" /*4*/(src_row_size),
- "m" /*5*/(y/*EndOffset*/),
- "m" /*6*/(pControl),
- "m" /*7*/(row_size),
- "m" /*8*/(vWorkYW),
- "m" /*9*/(dstp),
- "m" /*10*/(y/*vWorkUVW*/),
- "m" /*11*/(FPround2),
- "m" /*12*/(srcp1),
- "m" /*13*/(srcp2)
-#if !defined(__x86_64__)
- ,
- "m" /*14*/(oldbx),
- "m" /*15*/(SSEMMXenabledW),
- "m" /*16*/(SSE2enabledW)
- : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI
-#else
- : REGA, REGB, REGC, REGD, REGSI, REGDI
-#endif
- );
-
- dstp += dst_pitch;
- }
-#endif
- return 0;
-}
-
-/*
- * tools
- */
-
-#ifndef ALIGN
-/* Round pointer p up to the next multiple of b; b must be a power of two.
- * All arithmetic is done in uintptr_t so the mask is always as wide as a
- * pointer, whatever integer type the caller uses for b. */
-# define ALIGN(b,p) ((void*)((((uintptr_t)(p)) + ((uintptr_t)(b) - 1)) & ~((uintptr_t)(b) - 1)))
-#endif
-#ifndef MIN
-/* Smaller of a and b. Arguments may be evaluated more than once. */
-# define MIN(a,b) ((a) < (b) ? (a) : (b))
-#endif
-#ifndef MAX
-/* Larger of a and b. Arguments may be evaluated more than once. */
-# define MAX(a,b) ((a) > (b) ? (a) : (b))
-#endif
-#ifndef FABS
-/* Absolute value of x. The argument may be evaluated more than once. */
-# define FABS(x) ((x) < 0.0 ? -(x) : (x))
-#endif
-
-/*
- * xine plugin
- */
-
-#define PLUGIN_ID "warp" /* these three macros are defined ahead of the post_util.h include; presumably that header consumes them - confirm there */
-#define PLUGIN_DESCR "(non-)linear software scaling post plugin"; /* NOTE(review): the expansion ends with ';' - verify that every use in post_util.h tolerates the extra semicolon */
-#define PLUGIN_T warp_plugin_t
-/*#define POST_THREADS*/
-/*#define POST_SLICES*/
-#include "xine/post_util.h"
-
-
-/* Plugin class entry point; referenced from xine_plugin_info[]. */
-void *warp_init_plugin(xine_t *xine, void *data);
-
-/* Plugin class: creates one plugin instance. */
-static post_plugin_t *open_plugin(post_class_t *class_gen,
-                                  int inputs,
-                                  xine_audio_port_t **audio_target,
-                                  xine_video_port_t **video_target);
-
-/* Plugin instance: destructor, installed as post_plugin_t.dispose. */
-static void warp_dispose(post_plugin_t *this_gen);
-
-/* Per-frame hooks working on vo_frame_t. */
-static vo_frame_t *got_frame(vo_frame_t *frame);
-static void        draw_internal(vo_frame_t *frame, vo_frame_t *new_frame);
-
-
-/* Parameter API; bundled into the xine_post_api_t built in open_plugin(). */
-static xine_post_api_descr_t *warp_get_param_descr(void);
-static int   warp_set_parameters(xine_post_t *this_gen, void *param_gen);
-static int   warp_get_parameters(xine_post_t *this_gen, void *param_gen);
-static char *warp_get_help(void);
-
-
-typedef struct warp_parameters_s { /* user-settable parameters; copied as a whole by warp_set_parameters() / warp_get_parameters() */
- int output_width; /* requested output width; 0 -> got_frame() keeps the input width */
- int output_height; /* requested output height; 0 -> got_frame() keeps the input height */
- double output_aspect; /* requested output aspect ratio; 0 -> got_frame() keeps the input ratio */
- int no_downscaling; /* non-zero: got_frame() uses MAX(requested, input) for width and height */
-} warp_parameters_t;
-
-START_PARAM_DESCR(warp_parameters_t) /* Parameter descriptor table for warp_parameters_t; END_PARAM_DESCR names the object warp_param_descr, which warp_get_param_descr() returns.
- * The meaning of the PARAM_ITEM arguments is defined by the xine post API macros (not visible here); presumably
- * type, field, enum values, min, max, read-only flag, description - confirm in xine/post.h.
- * NOTE(review): open_plugin() initialises all four fields to 0, which lies below the minimum listed for the first
- * three items - confirm the listed ranges are advisory only. */
-PARAM_ITEM(POST_PARAM_TYPE_INT, output_width, NULL, 640, 1920, 0,
- "output video width")
-PARAM_ITEM(POST_PARAM_TYPE_INT, output_height, NULL, 480, 1080, 0,
- "output video height")
-PARAM_ITEM(POST_PARAM_TYPE_DOUBLE, output_aspect, NULL, 1, 3, 0,
- "output video aspect ratio")
-PARAM_ITEM(POST_PARAM_TYPE_BOOL, no_downscaling,NULL, 0, 1, 0,
- "disable downscaling")
-END_PARAM_DESCR(warp_param_descr)
-
-
-typedef struct { /* per-instance state of the warp/swscale post plugin */
- post_plugin_t post; /* kept first: open_plugin() and warp_dispose() cast between post_plugin_t* and warp_plugin_t* */
-
- xine_post_in_t parameter_input; /* "parameters" input; filled in and pushed onto post.input by open_plugin() */
-
- /* User config (changes to actual config are delayed) */
- warp_parameters_t config; /* overwritten by warp_set_parameters(); applied by got_frame() at the next detected format change */
-
- /* Current config */
- int enable; /* got_frame() clears this when output size and aspect match the input, and then returns NULL */
- int output_width; /* effective output width chosen in got_frame() */
- int output_height; /* effective output height chosen in got_frame() */
- double output_aspect; /* effective output aspect; calculate_factors() may adjust it */
- double factor_x; /* warp factors set by calculate_factors() and passed to init_tables_yv12() / init_tables_yuy2() */
- double factor_y;
-
- /* Last seen input frame */
- int input_width; /* got_frame() compares these against each frame; warp_set_parameters() zeroes width/height to force a re-init */
- int input_height;
- int input_format;
- int input_interlaced;
- double input_aspect;
-
- /* working buffers */
- uint32_t *vWorkY; /* 128-byte aligned regions inside pMem, laid out by init_tables() */
- uint32_t *vWorkUV;
-
- /* scaling tables */
- uint32_t *hControl; /* also inside pMem; contents are written by init_tables_yv12() / init_tables_yuy2() */
- uint32_t *hControlUV; /* the three *UV tables are only assigned by init_tables() for XINE_IMGFMT_YV12 input */
- uint32_t *vOffsets;
- uint32_t *vOffsetsUV;
- uint32_t *vWeights;
- uint32_t *vWeightsUV;
-
- /* memory for work areas and scaling tables */
- void *pMem; /* single malloc() block; freed in init_tables(), got_frame() and warp_dispose() */
-
-} warp_plugin_t;
-
-/*
- *
- */
-
-/*
- * (Re)allocate the work lines and scaling tables for the current input and
- * output geometry and fill the tables for the current input format.
- *
- * Reads input_width/height, input_format, input_interlaced, output_width/
- * height and factor_x/y from *this; writes pMem and all table/work pointers.
- * If the allocation fails, all pointers are cleared and this->enable is set
- * to 0 so that got_frame(), which tests enable right after this call, does
- * not request a scaled frame.
- */
-static void init_tables(warp_plugin_t *this)
-{
-#define BP(x) ((uint8_t*)(x))
-  /* allocate memory for scaling tables and workspace */
-  free(this->pMem);
-  this->pMem = malloc(this->input_width*3 + this->output_width*sizeof(uint32_t)*3*2 +
-                      this->output_height*sizeof(uint32_t)*4 + 2*9*128);
-
-  if (!this->pMem) {
-    /* ALIGN(128, NULL) would yield near-zero addresses that the table
-     * generators below would then write through. */
-    fprintf(stderr, "warp: init_tables: out of memory, scaling disabled\n");
-    this->vWorkY   = this->vWorkUV    = NULL;
-    this->hControl = this->hControlUV = NULL;
-    this->vOffsets = this->vOffsetsUV = NULL;
-    this->vWeights = this->vWeightsUV = NULL;
-    this->enable   = 0;
-    return;
-  }
-
-  /* - aligned for P4 cache line */
-  /* each region starts 128-byte aligned, at least 128 bytes past the end of the previous one */
-  this->vWorkY   = (uint32_t*)ALIGN(128, this->pMem);
-  this->vWorkUV  = (uint32_t*)ALIGN(128, BP(this->vWorkY)   + this->input_width*2 + 128);
-  this->hControl = (uint32_t*)ALIGN(128, BP(this->vWorkUV)  + this->input_width + 128);
-  this->vOffsets = (uint32_t*)ALIGN(128, BP(this->hControl) + this->output_width  * sizeof(uint32_t) * 3 + 128);
-  this->vWeights = (uint32_t*)ALIGN(128, BP(this->vOffsets) + this->output_height * sizeof(uint32_t) + 128);
-
-  if (this->input_format == XINE_IMGFMT_YV12) {
-    /* planar input needs a second set of tables for the chroma planes */
-    this->vOffsetsUV = (uint32_t*)ALIGN(128, BP(this->vWeights)   + this->output_height * sizeof(uint32_t) + 128);
-    this->vWeightsUV = (uint32_t*)ALIGN(128, BP(this->vOffsetsUV) + this->output_height * sizeof(uint32_t) + 128);
-    this->hControlUV = (uint32_t*)ALIGN(128, BP(this->vWeightsUV) + this->output_height * sizeof(uint32_t) + 128);
-
-    init_tables_yv12(this->output_width, this->output_height,
-                     this->input_width, this->input_height,
-                     this->input_interlaced, this->factor_x, this->factor_y,
-                     this->hControl, this->vOffsets, this->vWeights,
-                     this->hControlUV, this->vOffsetsUV, this->vWeightsUV );
-
-  } else if (this->input_format == XINE_IMGFMT_YUY2) {
-
-    init_tables_yuy2(this->output_width, this->output_height,
-                     this->input_width, this->input_height,
-                     this->input_interlaced, this->factor_x, this->factor_y,
-                     this->hControl, this->vOffsets, this->vWeights );
-  }
-}
-
-/*
- * Derive the warp factors factor_x / factor_y from the difference between
- * input_aspect and output_aspect.
- *
- * Differences within +-0.1 leave both factors at 1.0.  Differences beyond
- * +-(0.1 + 4/9) use fixed factors and additionally move output_aspect so
- * that only a 4/9 difference is left.  In between, the factors are
- * interpolated linearly from the difference.
- */
-static void calculate_factors(warp_plugin_t *this)
-{
-  /* try to guess amount to stretch/shrink */
-  double adiff = this->input_aspect - this->output_aspect;
-
-  /* default: no warp; overridden below when the ratios differ enough */
-  this->factor_x = 1.0;
-  this->factor_y = 1.0;
-
-  if (adiff > 0.1) {
-
-    if (adiff > 0.1 + ((16.0-12.0)/9.0)) {
-      /* >16:9 -> >4:3 */
-      DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff);
-      this->factor_x = 0.95;
-      this->factor_y = 1.15;
-      this->output_aspect += (adiff - 4.0/9.0);
-      DBG(" changing target ratio to %3.1lf\n", this->output_aspect);
-    } else {
-      /* 16:9 ... 12:9 -> 4:3 */
-      DBG("aspect ratio diff %1.3lf > 0 : 16.9...12:9 -> 4:3\n", adiff);
-      this->factor_x = 1.0 - 0.05 * adiff * 9.0/4.0;
-      this->factor_y = 1.0 + 0.15 * adiff * 9.0/4.0;
-    }
-
-  } else if (adiff < -0.1) {
-
-    if(adiff < -0.1-((16.0-12.0)/9.0)) {
-      /* <4:3 -> <16:9 */
-      DBG("aspect ratio diff %1.3lf < 0 : too large !\n", adiff);
-      this->factor_x = 1.05;
-      this->factor_y = 0.85;
-      this->output_aspect += (adiff + 4.0/9.0);
-      DBG(" changing target ratio to %3.1lf\n", this->output_aspect);
-    } else {
-      /* 4:3...16:9 -> 16:9 */
-      /* NOTE(review): at adiff = -4/9 these formulas give 0.95 / 1.15 while
-       * the branch above uses 1.05 / 0.85; the positive side is continuous
-       * at +4/9.  Confirm which sign is intended before changing anything. */
-      DBG("aspect ratio diff %1.3lf < 0 : 4:3...16:9 -> 16:9\n", adiff);
-      this->factor_x = 1.0 + 0.05 * adiff * 9.0/4.0;
-      this->factor_y = 1.0 - 0.15 * adiff * 9.0/4.0;
-    }
-
-  } else {
-    DBG("aspect ratio matches, no warp\n");
-  }
-
-  DBG("factor_x = %1.3lf factor_y = %1.3lf output ratio = %1.3lf\n",
-      this->factor_x, this->factor_y, this->output_aspect);
-}
-
-/*
- *
- */
-
-/*
- * Plugin class initialisation, referenced from xine_plugin_info[].
- * On non-x86_64 builds the CPU must report MMX, otherwise NULL is returned;
- * the actual class setup is delegated to init_plugin().
- */
-void *warp_init_plugin(xine_t *xine, void *data)
-{
-#if !defined(__x86_64__)
-  /* Need at least MMX; the check is compiled out on x86_64 builds */
-  const int have_mmx = (xine_mm_accel() & MM_ACCEL_X86_MMX) != 0;
-
-  if (!have_mmx) {
-    fprintf(stderr, "warp_init_plugin: ERROR: at least MMX required\n");
-    return NULL;
-  }
-#endif
-
-  return init_plugin(xine, data);
-}
-
-/*
- * Create one plugin instance that intercepts video_target[0].
- * Returns NULL when the allocation fails or no video target is given.
- * class_gen, inputs and audio_target are not used.
- */
-static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs,
-                                  xine_audio_port_t **audio_target,
-                                  xine_video_port_t **video_target)
-{
-  /* parameter API shared by all instances */
-  static xine_post_api_t post_api =
-    { warp_set_parameters, warp_get_parameters, warp_get_param_descr, warp_get_help };
-
-  warp_plugin_t     *this     = calloc(1, sizeof(*this));
-  post_plugin_t     *this_gen = (post_plugin_t *) this;
-  post_video_port_t *port;
-  post_in_t         *input;
-  post_out_t        *output;
-
-  if (!(this && video_target && video_target[0])) {
-    free(this);
-    return NULL;
-  }
-
-  _x_post_init(this_gen, 0, 1);
-
-  /* hook the video port and install the frame callbacks */
-  port = _x_post_intercept_video_port(this_gen, video_target[0], &input, &output);
-  port->intercept_frame = intercept_frame_yuy;
-  port->new_frame->draw = post_draw;
-  input->xine_in.name   = "video";
-  output->xine_out.name = "video (scaled)";
-  this_gen->xine_post.video_input[0] = &port->new_port;
-
-  this_gen->dispose = warp_dispose;
-
-  /* expose the parameter API as an additional input */
-  this->parameter_input.name = "parameters";
-  this->parameter_input.type = XINE_POST_DATA_PARAMETERS;
-  this->parameter_input.data = &post_api;
-  xine_list_push_back(this_gen->input, &this->parameter_input);
-
-  /* defaults: leave size and aspect ratio untouched, allow downscaling */
-  this->config.output_width   = 0;
-  this->config.output_height  = 0;
-  this->config.output_aspect  = 0.0;
-  this->config.no_downscaling = 0;
-
-  /* input size not known yet, triggers initialization later */
-  this->input_width  = 0;
-  this->input_height = 0;
-
-  return this_gen;
-}
-
-/*
- * Instance destructor: releases the table memory and the instance itself,
- * but only when _x_post_dispose() returns non-zero.
- */
-static void warp_dispose(post_plugin_t *this_gen)
-{
-  warp_plugin_t *this;
-
-  if (!_x_post_dispose(this_gen))
-    return;
-
-  this = (warp_plugin_t *) this_gen;
-
-  DBG("dispose\n");
-
-  free(this->pMem);
-  free(this);
-}
-
-/*
- * Per-frame hook: detects changes of the input format, re-derives the
- * output configuration and scaling tables when needed, and fetches the
- * output frame from the original port.
- *
- * Returns NULL when no scaling is needed (processing disabled), otherwise
- * the frame obtained from port->original_port->get_frame().
- */
-static vo_frame_t *got_frame(vo_frame_t *frame)
-{
-  post_video_port_t *port = (post_video_port_t *)frame->port;
-  warp_plugin_t     *this = (warp_plugin_t *)port->post;
-  const int interlaced = !!(frame->flags & VO_INTERLACED_FLAG);
-  double adiff = this->input_aspect - frame->ratio;
-
-  const int format_changed =
-    this->input_width      != frame->width  ||
-    this->input_height     != frame->height ||
-    this->input_format     != frame->format ||
-    FABS(adiff) > 0.1                       ||
-    this->input_interlaced != interlaced;
-
-  if (format_changed) {
-    int same_size;
-
-    DBG("detected frame format change: %dx%d -> %dx%d, interlaced %d->%d, aspect %1.3lf->%1.3lf, %s->%s\n",
-        this->input_width, this->input_height, frame->width, frame->height,
-        this->input_interlaced, interlaced,
-        this->input_aspect, frame->ratio,
-        this->input_format==XINE_IMGFMT_YV12 ? "yv12":"yuy2",
-        frame->format==XINE_IMGFMT_YV12 ? "yv12":"yuy2" );
-
-    /* the old tables and buffers no longer fit */
-    free(this->pMem);
-    this->pMem = NULL;
-
-    /* remember what we have seen, to detect the next change */
-    this->input_width      = frame->width;
-    this->input_height     = frame->height;
-    this->input_format     = frame->format;
-    this->input_aspect     = frame->ratio;
-    this->input_interlaced = interlaced;
-
-    /* apply the user config; 0 means "same as input" */
-    this->output_aspect = this->config.output_aspect ? this->config.output_aspect
-                                                     : frame->ratio;
-    if (this->config.no_downscaling) {
-      this->output_width  = MAX(this->config.output_width,  frame->width);
-      this->output_height = MAX(this->config.output_height, frame->height);
-    } else {
-      this->output_width  = this->config.output_width  ? this->config.output_width
-                                                       : frame->width;
-      this->output_height = this->config.output_height ? this->config.output_height
-                                                       : frame->height;
-    }
-
-    /* calculate warp function factors (may adjust output_aspect) */
-    calculate_factors(this);
-
-    adiff     = this->input_aspect - this->output_aspect;
-    same_size = this->output_width  == frame->width &&
-                this->output_height == frame->height;
-
-    if (same_size && adiff < 0.1 && adiff > -0.1) {
-      this->enable = 0;
-      DBG("--> nothing to do, disabling processing for now\n");
-      return NULL;
-    }
-
-    this->enable = 1;
-
-    init_tables(this);
-  }
-
-  if (!this->enable)
-    return NULL;
-
-  return port->original_port->get_frame(port->original_port,
-                                        this->output_width, this->output_height,
-                                        this->output_aspect, frame->format,
-                                        frame->flags | VO_BOTH_FIELDS);
-}
-
-/*
- * Scale the input frame into new_frame using the prepared tables.
- * YV12: the luma plane at full size, then both chroma planes at half width
- * and half height using the UV tables.  YUY2: one packed plane.
- * Other formats are left untouched.  Every call starts at output line 0
- * and the return values of the do_warp_* functions are ignored.
- */
-static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame)
-{
-  post_video_port_t *port = (post_video_port_t *)frame->port;
-  warp_plugin_t     *this = (warp_plugin_t *)port->post;
-  const int src_height = frame->height;
-
-  if (frame->format == XINE_IMGFMT_YV12) {
-    int plane;
-
-    /* luma */
-    do_warp_yv12(new_frame->base[0], frame->base[0],
-                 new_frame->pitches[0], frame->pitches[0],
-                 this->output_width, this->output_height,
-                 frame->width, src_height,
-                 this->input_interlaced,
-                 this->hControl, this->vOffsets, this->vWeights,
-                 this->vWorkY,
-                 0);
-
-    /* both chroma planes share the UV tables and the UV work line */
-    for (plane = 1; plane <= 2; plane++) {
-      do_warp_yv12(new_frame->base[plane], frame->base[plane],
-                   new_frame->pitches[plane], frame->pitches[plane],
-                   this->output_width/2, this->output_height/2,
-                   frame->width/2, src_height/2,
-                   this->input_interlaced,
-                   this->hControlUV, this->vOffsetsUV, this->vWeightsUV,
-                   this->vWorkUV,
-                   0);
-    }
-    return;
-  }
-
-  if (frame->format == XINE_IMGFMT_YUY2) {
-    do_warp_yuy2(new_frame->base[0], frame->base[0],
-                 new_frame->pitches[0], frame->pitches[0],
-                 this->output_width, this->output_height,
-                 frame->width, src_height,
-                 this->input_interlaced,
-                 this->hControl, this->vOffsets, this->vWeights,
-                 this->vWorkY, this->vWorkUV,
-                 0);
-  }
-}
-
-/*
- * parameter functions
- */
-
-/* Parameter API: returns the address of the descriptor table warp_param_descr. */
-static xine_post_api_descr_t *warp_get_param_descr(void)
-{
-  xine_post_api_descr_t *const descr = &warp_param_descr;
-
-  return descr;
-}
-
-/*
- * Parameter API: copy a warp_parameters_t from param_gen into the user
- * config and clear the remembered input size, so that got_frame() sees a
- * format change on the next frame.  An aspect value above 999 is divided
- * by 1000.  Always returns 1.
- */
-static int warp_set_parameters(xine_post_t *this_gen, void *param_gen)
-{
-  warp_plugin_t *this = (warp_plugin_t *)this_gen;
-
-  memcpy(&this->config, param_gen, sizeof(this->config));
-
-  /* forget the last seen input size */
-  this->input_height = 0;
-  this->input_width  = 0;
-
-  if (this->config.output_aspect > 999) {
-    this->config.output_aspect /= 1000.0;
-  }
-
-  DBG("warp_set_parameters: "
-      "output_width=%d, output_height=%d, output_aspect=%4.3lf, no_downscaling=%d\n",
-      this->config.output_width, this->config.output_height, this->config.output_aspect,
-      this->config.no_downscaling);
-
-  return 1;
-}
-
-/*
- * Parameter API: copy the current user config into the warp_parameters_t
- * that param_gen points to.  Always returns 1.
- */
-static int warp_get_parameters(xine_post_t *this_gen, void *param_gen)
-{
-  const warp_plugin_t *this = (warp_plugin_t *)this_gen;
-
-  DBG("warp_get_parameters\n");
-
-  memcpy(param_gen, &this->config, sizeof(this->config));
-
-  return 1;
-}
-
-static char *warp_get_help(void) { /* Parameter API: returns the help text below, passed through the _() macro (presumably the translation hook - confirm where _ is defined) */
- return _(
- "The warp plugin scales video to another resolution. "
- "It supports non-linear stretching to change video aspect ratio. "
- "\n"
- "Parameters\n"
- " output_width: Scale video to width\n"
- " (0 -> do not change video width)\n"
- " output_height: Scale video to height\n"
- " (0 -> do not change video height)\n"
- " output_aspect: Adjust aspect ratio using non-linear scaling\n"
- " (0 -> do not change video aspect ratio)\n"
- " no_downscaling: Do not downscale video\n"
- "\n"
- );
-}
-
-
-/*
- * plugin info
- */
-
-static post_info_t info = { XINE_POST_TYPE_VIDEO_FILTER }; /* special_info shared by both entries of the table below */
-
-const plugin_info_t xine_plugin_info[] __attribute__((visibility("default"))) = /* exported with default visibility; the same init function is registered under the names "warp" and "swscale" */
-{
- /* type, API, "name", version, special_info, init_function */
- { PLUGIN_POST, POST_PLUGIN_IFACE_VERSION, "warp", XINE_VERSION_CODE, &info, &warp_init_plugin },
- { PLUGIN_POST, POST_PLUGIN_IFACE_VERSION, "swscale", XINE_VERSION_CODE, &info, &warp_init_plugin },
- { PLUGIN_NONE, 0, "", 0, NULL, NULL } /* PLUGIN_NONE entry; presumably the end-of-table marker expected by the xine plugin loader - confirm */
-};