diff options
author | cvs2svn <admin@example.com> | 2009-10-21 00:02:02 +0000 |
---|---|---|
committer | cvs2svn <admin@example.com> | 2009-10-21 00:02:02 +0000 |
commit | 97a97ca3358eb48de3eb7a222e487e800566569f (patch) | |
tree | 97c920d0225a1c9773a3bce2207f261d7d230123 /xine_post_swscale.c | |
parent | a61961358c5a2ec92340b3f8e056bab55438f103 (diff) | |
download | xineliboutput-CVS.tar.gz xineliboutput-CVS.tar.bz2 |
This commit was manufactured by cvs2svn to create branch 'CVS'.CVS
Diffstat (limited to 'xine_post_swscale.c')
-rw-r--r-- | xine_post_swscale.c | 1730 |
1 files changed, 0 insertions, 1730 deletions
diff --git a/xine_post_swscale.c b/xine_post_swscale.c deleted file mode 100644 index 9e5be7c2..00000000 --- a/xine_post_swscale.c +++ /dev/null @@ -1,1730 +0,0 @@ -/* - * Copyright (C) 2000-2007 the xine project - * - * This file is part of xine, a free video player. - * - * xine is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * xine is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * $Id: xine_post_swscale.c,v 1.9 2008-12-13 14:24:03 phintuka Exp $ - * - * Simple (faster) resize for avisynth - * Copyright (C) 2002 Tom Barry - * - * Very simple 2 tap linear interpolation. - * It is unfiltered which means it will not soften much. - * - * WarpedResize will do a non-linear stretch/squeeze in both the horizontal - * and vertical dimensions. This can be useful when you want to change the - * aspect ratio of a video clip and have it mostly distorted at the - * top, bottom, and side edges. - * - * - * Ported to linux/xine by Petri Hintukainen <phintuka@users.sourceforge.net> - * - Added x86_64 support - * - Added PIC support (do not clobber ebx in x86, access only local variables from asm) - * - Fixed yv12 stretched warp tables generation - */ - -#include <xine/xine_internal.h> -#include <xine/post.h> - -/*#define DBG(x...)*/ -#define DBG(x...) 
fprintf(stderr, "post_warp: " x) - -/*#define STREAMING_STORE_TMP*/ -/*#define STREAMING_STORE*/ -/*#define PREFETCH*/ -/* streaming store and prefetch seems to be slower ... - * Tested with P3 (128M L2) and C2D (4M L2). - * Maybe access pattern is simple enough for HW prefetchers. - */ - -/*#define VANILLA*/ - -/* - * This function accepts a position from 0 to 1 and warps it, to 0 through 1 based - * upon the wFact var. The warp equations are designed to: - * - * * Always be rising but yield results from 0 to 1 - * - * * Have a first derivative that doesn't go to 0 or infinity, at least close - * to the center of the screen - * - * * Have a curvature (absolute val of 2nd derivative) that is small in the - * center and smoothly rises towards the edges. We would like the curvature - * to be everywhere = 0 when the warp factor = 1 - * - * position: coordinate to warp; it is recentred as x = 2 * (position - .5). - * wFact: warp factor; it is mirrored to w = 2.0 - wFact, so wFact = 1 gives w = 1, - * the cubic term drops out and position is returned unchanged. - * Returns .5 - .5 * z for x < 0 and .5 + .5 * z otherwise, with z as computed below. - */ -static double WarpFactor(double position, double wFact) -{ - double x; - double z; - double w; - x = 2 * (position - .5); - if (1) /*(wFact < 1.0)*/ - /* For warp factor < 1 the warp is calculated as (1-w) * x^3 + w *x, centered - * - * The warp is calculated as z = (1 - w) * x^3 + w * x, centered - * around .5 and ranging from 0 to 1. After some tinkering this seems - * to give decent values and derivatives at the right places. - */ - w = 2.0 - wFact; /* reverse parm for compat with initial release */ - - if (x < 0.0) { - z = -(1 - w) * x*x*x - w * x; /* -1 < x < 0, wFact < 1 */ - return .5 - .5 * z; - } else { - z = (1 - w) * x*x*x + w * x; /* 0 <= x <= 1, wFact < 1 */ - return .5 + .5 * z; /* amounts to same formula as above for now */ - } -} - -/* - * YV12 - * - * For each horizontal output pair of pixels there is are 2 qword masks followed by 2 int - * offsets. The 2 masks are the weights to be used for the luma and chroma, respectively. - * Each mask contains LeftWeight1, RightWeight1, LeftWeight2, RightWeight2. So a pair of pixels - * will later be processed each pass through the horizontal resize loop. 
I think with my - * current math the Horizontal Luma and Chroma contains the same values but since I may have screwed it - * up I'll leave it this way for now. Vertical chroma is different. - * - * Note - try just using the luma calcs for both, seem to be the same. - * - * The weights are scaled 0-256 and the left and right weights will sum to 256 for each pixel. - */ -static void init_tables_yv12(int newwidth, int newheight, int oldwidth, int oldheight, - int Interlaced, double hWarp, double vWarp, - uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights, - uint32_t *hControlUV, uint32_t *vOffsetsUV, uint32_t *vWeightsUV) -{ - int i; - int j; - int k; - int wY1; - int wY2; - DBG("init_yv12: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n", - oldwidth, oldheight, newwidth, newheight, hWarp, vWarp); - - /* First set up horizontal table, use for both luma & chroma since - * it seems to have the same equation. - * We will geneerate these values in pairs, mostly because that's the way - * I wrote it for YUY2 above. 
- */ - - for(i=0; i < newwidth; i+=2) { - /* first make even pixel control - * - * Each output pixel pair (even i) owns six uint32 slots starting at hControl[i*3]: - * slots 0 and 1 take the packed weights of the even and odd pixel - * (right weight << 16 | left weight), slots 4 and 5 take their source pixel - * offsets; slots 2 and 3 are not written by this function. - * - * NOTE(review): the unwarped branch divides by (newwidth-1) in integer arithmetic - * and evaluates i * 256 * (oldwidth-1) in int; neither is guarded here. - */ - if (hWarp==1) /*if no warp factor */ - j = i * 256 * (oldwidth-1) / (newwidth-1); - else /* stretch and warp somehow */ - j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1)); - - k = j>>8; - wY2 = j - (k << 8); /* luma weight of right pixel */ - wY1 = 256 - wY2; /* luma weight of left pixel */ - - if (k > oldwidth - 2) { - hControl[i*3+4] = oldwidth - 1; /* point to last byte */ - hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */ - } else { - hControl[i*3+4] = k; /* pixel offset */ - hControl[i*3] = wY2 << 16 | wY1; /* luma weights */ - } - - /* now make odd pixel control */ - if (hWarp==1) /* if no warp factor */ - j = (i+1) * 256 * (oldwidth-1) / (newwidth-1); - else /* stretch and warp somehow */ - j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1)); - - k = j>>8; - wY2 = j - (k << 8); /* luma weight of right pixel */ - wY1 = 256 - wY2; /* luma weight of left pixel */ - - if (k > oldwidth - 2) { - hControl[i*3+5] = oldwidth - 1; /* point to last byte */ - hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */ - } else { - hControl[i*3+5] = k; /* pixel offset */ - hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */ - } - } - - hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */ - hControl[newwidth*3+5] = 2 * (oldwidth-1); /* " */ -#ifndef VANILLA - // UV - for(i=0; i < newwidth/2; i+=2) { - /* first make even pixel control */ - if (hWarp==1) /*if no warp factor */ - j = i * 256 * (oldwidth/2-1) / (newwidth/2-1); - else /* stretch and warp somehow */ - j = (int) (256 * WarpFactor(i / (newwidth/2-1.0), hWarp) * (oldwidth/2-1)); - - k = j>>8; - wY2 = j - (k << 8); /* chroma weight of right pixel */ - wY1 = 256 - wY2; /* chroma weight of left pixel */ - - if (k > oldwidth/2 - 2) { - hControlUV[i*3+4] = oldwidth/2 - 1; /* point to last byte */ - hControlUV[i*3] = 0x00000100; /* use 100% of rightmost chroma sample */ 
- } else { - hControlUV[i*3+4] = k; /* pixel offset */ - hControlUV[i*3] = wY2 << 16 | wY1; /* chroma weights */ - } - - /* now make odd pixel control */ - if (hWarp==1) /* if no warp factor */ - j = (i+1) * 256 * (oldwidth/2-1) / (newwidth/2-1); - else /* stretch and warp somehow */ - j = (int) (256 * WarpFactor((i+1) / (newwidth/2-1.0), hWarp) * (oldwidth/2-1)); - - k = j>>8; - wY2 = j - (k << 8); /* chroma weight of right pixel */ - wY1 = 256 - wY2; /* chroma weight of left pixel */ - - if (k > oldwidth/2 - 2) { - hControlUV[i*3+5] = oldwidth/2 - 1; /* point to last byte */ - hControlUV[i*3+1] = 0x00000100; /* use 100% of rightmost chroma sample */ - } else { - hControlUV[i*3+5] = k; /* pixel offset */ - hControlUV[i*3+1] = wY2 << 16 | wY1; /* chroma weights */ - } - } - - hControlUV[newwidth/2*3+4] = (oldwidth/2-1); /* give it something to prefetch at end */ - hControlUV[newwidth/2*3+5] = (oldwidth/2-1); /* " */ -#endif - - /* Next set up vertical tables. The offsets are measured in lines and will be mult */ - /* by the source pitch later. */ - - /* For YV12 we need separate Luma and chroma tables */ - - /* First Luma Table */ - for(i=0; i< newheight; ++i) { - if (vWarp==1) /* if no warp factor */ - j = i * 256 * (oldheight-1) / (newheight-1); - else /* stretch and warp somehow */ - j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1)); - - if (Interlaced) { /* do hard way? */ - if (i%2) { /* is odd output line? 
*/ - if (j < 256) { /* before 1st odd input line */ - vOffsets[i] = 1; /* all from line 1 */ - vWeights[i] = 0; /* weight to give to 2nd line */ - } else { - k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */ - vOffsets[i] = k; - wY2 = j - (k << 8); - vWeights[i] = wY2 >> 1; /* weight to give to 2nd line; wY2 spans two source lines here, hence the halving */ - } - } else { /* is even output line */ - k = (j >> 9) << 1; /* next lower even line */ - vOffsets[i] = k; - wY2 = j - (k << 8); - vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */ - } - } else { /* simple way, do as progressive */ - k = j >> 8; - vOffsets[i] = k; - wY2 = j - (k << 8); - vWeights[i] = wY2; /* weight to give to 2nd line */ - } - } - - /* Vertical table for chroma. - * Non-VANILLA builds size the mapping from the half-height planes and clamp - * a negative j to 0 before it is split into offset and weight. */ - for(i=0; i< newheight/2; ++i) { - if (vWarp==1) /* if no warp factor */ -#ifdef VANILLA - j = (int) ( (i+.25) * 256 * (oldheight-1) / (newheight-1.0) - 64 ); -#else - j = (int) ( (i+.25) * 256 * (oldheight/2-1) / (newheight/2-1.0) - 64 ); -#endif - else /* stretch and warp somehow */ -#ifdef VANILLA - j = (int) (256 * WarpFactor( (i+.25) / (newheight-1.0), vWarp) * (oldheight-1.0) ); -#else - j = (int) (256 * WarpFactor( (i+.25) / (newheight/2 - 1.0), vWarp) * (oldheight/2 - 1.0) ); -#endif -#ifndef VANILLA - if(j<0) j=0; -#endif - if (Interlaced) { /* do hard way? */ - if (i%2) { /* is odd output line? 
*/ - if (j < 256) { /* before 1st odd input line */ - vOffsetsUV[i] = 1; /* all from line 1 */ - vWeightsUV[i] = 0; /* weight to give to 2nd line */ - } else { - k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */ - vOffsetsUV[i] = k; - wY2 = j - (k << 8); - vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */ - } - } else { /* is even output line */ -#ifdef VANILLA - k = (j >> 9) << 1; /* next lower even line */ - vOffsetsUV[i] = k; - wY2 = j - (k << 8); - vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */ -#else - k = (j / 512) << 1; /* next lower even line */ - vOffsetsUV[i] = k; - wY2 = j - (k << 8); - vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */ -#endif - } - } else { /* simple way, do as progressive */ -#ifdef VANILLA - k = j >> 8; -#else - k = j / 256; /* j >> 8; does not work right if -256 < j < 0 */ -#endif - vOffsetsUV[i] = k; - wY2 = j - (k << 8); - vWeightsUV[i] = wY2; /* weight to give to 2nd line */ - } - } -} - -/* - * YUY2 - * - * For each horizontal output pair of pixels there is are 2 qword masks followed by 2 int - * offsets. The 2 masks are the weights to be used for the luma and chroma, respectively. - * Each mask contains LeftWeight1, RightWeight1, LeftWeight2, RightWeight2. So a pair of pixels - * will later be processed each pass through the horizontal resize loop. - * - * The weights are scaled 0-256 and the left and right weights will sum to 256 for each pixel. 
- */ -static void init_tables_yuy2(int newwidth, int newheight, int oldwidth, int oldheight, - int Interlaced, double hWarp, double vWarp, - uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights ) -{ - int i; - int j; - int k; - int wY1; - int wY2; - int wUV1; - int wUV2; - DBG("init_yuy2: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n", - oldwidth, oldheight, newwidth, newheight, hWarp, vWarp); - - /* First set up horizontal table. - * Each output pixel pair (even i) owns six uint32 slots starting at hControl[i*3]: - * slots 0 and 1 take the packed luma weights of the even and odd pixel - * (right weight << 16 | left weight), slots 2 and 3 the packed chroma weights, - * slots 4 and 5 the source pixel offsets. - * NOTE(review): the unwarped branch divides by (newwidth-1) in integer arithmetic - * and evaluates i * 256 * (oldwidth-1) in int; neither is guarded here. - */ - for(i=0; i < newwidth; i+=2) { - /* first make even pixel control */ - if (hWarp==1) /* if no warp factor */ - j = i * 256 * (oldwidth-1) / (newwidth-1); - else /* stretch and warp somehow */ - j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1)); - - k = j>>8; - wY2 = j - (k << 8); /* luma weight of right pixel */ - wY1 = 256 - wY2; /* luma weight of left pixel */ - wUV2 = (k%2) ? 128 + (wY2 >> 1) : wY2 >> 1; - wUV1 = 256 - wUV2; - - if (k > oldwidth - 2) { - hControl[i*3+4] = oldwidth - 1; /* point to last byte */ - hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */ - hControl[i*3+2] = 0x00000100; /* use 100% of rightmost U */ - } else { - hControl[i*3+4] = k; /* pixel offset */ - hControl[i*3] = wY2 << 16 | wY1; /* luma weights */ - hControl[i*3+2] = wUV2 << 16 | wUV1; /* chroma weights */ - } - - /* now make odd pixel control */ - if (hWarp==1) /* if no warp factor */ - j = (i+1) * 256 * (oldwidth-1) / (newwidth-1); - else /* stretch and warp somehow */ - j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1)); - - k = j>>8; - wY2 = j - (k << 8); /* luma weight of right pixel */ - wY1 = 256 - wY2; /* luma weight of left pixel */ - wUV2 = (k%2) ? 
128 + (wY2 >> 1) : wY2 >> 1; - wUV1 = 256 - wUV2; - - if (k > oldwidth - 2) { - hControl[i*3+5] = oldwidth - 1; /* point to last byte */ - hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */ - hControl[i*3+3] = 0x00000100; /* use 100% of rightmost V */ - } else { - hControl[i*3+5] = k; /* pixel offset */ - hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */ - /* hControl[i*3+3] = wUV2 << 16 | wUV1; // chroma weights */ - /* horiz chroma weights should be same as for even pixel - trbarry 09/16/2002 - * (so the wUV1 and wUV2 values computed for the odd pixel are not stored) */ - hControl[i*3+3] = hControl[i*3+2]; /* chroma weights */ - } - } - - hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */ - hControl[newwidth*3+5] = 2 * (oldwidth-1); - - /* Next set up vertical table. The offsets are measured in lines and will be mult */ - /* by the source pitch later */ - for(i=0; i< newheight; ++i) { - if (vWarp==1) /* if no warp factor */ - j = i * 256 * (oldheight-1) / (newheight-1); - else /* stretch and warp somehow */ - j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1)); - - if (Interlaced) { /* do hard way? */ - if (i%2) { /* is odd output line? */ - if (j < 256) { /* before 1st odd input line */ - vOffsets[i] = 1; /* all from line 1 */ - vWeights[i] = 0; /* weight to give to 2nd line */ - } else { - k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */ - vOffsets[i] = k; - wY2 = j - (k << 8); - vWeights[i] = wY2 >> 1; /* weight to give to 2nd line; wY2 spans two source lines here, hence the halving */ - } - } else { /* is even output line */ - k = (j >> 9) << 1; /* next lower even line */ - vOffsets[i] = k; - wY2 = j - (k << 8); - vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */ - } - } else { /* simple way, do as progressive */ - k = j >> 8; - vOffsets[i] = k; - wY2 = j - (k << 8); - vWeights[i] = wY2; /* weight to give to 2nd line */ - } - } -} - -/* Register allocation */ -/* index/counter registers (REGA, REGC) are loaded from 32bit vars/arrays ! 
*/ -#define REGEA "eax" -#define REGEB "ebx" -#if defined(__x86_64__) -# define REGA "rax" -# define REGB "rbx" -# define REGC "ecx" -# define REGD "rdx" -# define REGDI "rdi" -# define REGSI "rsi" -#elif defined(__i386__) -# define REGA "eax" -# define REGB "ebx" -# define REGC "ecx" -# define REGD "edx" -# define REGDI "edi" -# define REGSI "esi" -#endif - -/* variables accessed from assembler code: each macro is the positional operand - * number of that variable in the memory operand list of the asm statement */ -#define _FPround1 "%0" -#define _vWeight1 "%1" -#define _vWeight2 "%2" -#define _YMask "%3" -#define _src_row_size "%4" -#define _EndOffset "%5" -#define _pControl "%6" -#define _row_size "%7" -#define _vWorkYW "%8" -#define _dstp "%9" -#define _vWorkUVW "%10" -#define _FPround2 "%11" -#define _srcp1 "%12" -#define _srcp2 "%13" -#if !defined(__x86_64__) -#define _oldbx "%14" -#define _SSEMMXenabledW "%15" -#define _SSE2enabledW "%16" -#endif - -/* Labels: numeric local labels, referenced from the asm with an f or b suffix */ -#define vMaybeSSEMMX "1" -#define LessThan8 "2" -#define LessThan4 "3" -#define AllDone "4" -#define LastOne "5" -#define vLoopSSE2_Fetch "6" -#define vLoopSSE2 "7" -#define vLoopSSEMMX_Fetch "8" -#define vLoopSSEMMX "9" -#define vLoopMMX "10" -#define MoreSpareChange "11" -#define DoHorizontal "12" -#define hLoopMMX "13" -#define hLoopMMXSSE "14" - - -/* structure for mmx constants */ -typedef union { - uint64_t uq[1]; /* Unsigned Quadword */ - uint32_t ud[2]; /* Unsigned Doubleword */ -} ATTR_ALIGN(16) mmx_t; - -/* structure for sse2 constants */ -typedef union { - uint64_t uq[2]; /* Unsigned Quadword */ - uint32_t ud[4]; /* Unsigned Doubleword */ -} ATTR_ALIGN(16) sse2_t; - - -static int do_warp_yuy2(uint8_t *dst, const uint8_t *src, - const int dst_pitch, const int src_pitch, - const int dst_width, const int dst_height, - const int src_width, const int src_height, - const int Interlaced, const uint32_t * const hControl, - const uint32_t * const vOffsets, const uint32_t * const vWeights, - uint32_t *vWorkY, uint32_t *vWorkUV, - int dst_start) -{ -#if defined(__i386__) || defined(__x86_64__) - sse2_t YMask = 
{uq:{UINT64_C(0x00ff00ff00ff00ff),UINT64_C(0x00ff00ff00ff00ff)}}; /* keeps only luma */ - sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */ - sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */ - sse2_t vWeight1; - sse2_t vWeight2; - - const uint32_t *pControl = &hControl[0]; - const uint32_t *vWorkYW = vWorkY; - const uint32_t *vWorkUVW = vWorkUV; - const uint8_t *srcp = src; - const uint8_t *srcp1; - const uint8_t *srcp2; - uint8_t *dstp = dst + dst_pitch*dst_start; - - const uint32_t src_row_size = src_width * 2; - const uint32_t row_size = dst_width * 2; - const uint32_t EndOffset = src_row_size / 2; - -#if !defined(__x86_64__) - const int accel = xine_mm_accel(); - const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */ - const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */ - long int oldbx; -#endif - int y; - - for (y = dst_start; y < dst_height; y++) { - - if(vOffsets[y] >= src_height) { - /* slice completed: the source row needed for output row y is not inside - * this slice, so hand y back to the caller as the next row to produce */ - /*DBG("do_warp_yuy2: max input height reached: need line %d, height %d\n -> Returning next output line: %d\n", - vOffsets[y], src_height, y);*/ - return y; - } - - vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] = - (256-vWeights[y]) << 16 | (256-vWeights[y]); - vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] = - vWeights[y] << 16 | vWeights[y]; - - srcp1 = srcp + vOffsets[y] * src_pitch; - if (Interlaced) - srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1; - else - srcp2 = (y < dst_height-1) ? 
srcp1 + src_pitch : srcp1; - - __asm__ __volatile__ ( -#if !defined(__x86_64__) - /* store ebx (PIC) */ - "mov %%"REGB", "_oldbx" \n\t" -#endif - "movl "_src_row_size", %%"REGC" \n\t" - "shrl $3, %%"REGC" \n\t" /* 8 bytes a time */ - "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */ - "mov "_srcp2", %%"REGD" \n\t" /* next " */ - "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */ - "mov "_vWorkUVW", %%"REGB" \n\t" /* chroma work destination line */ - "xor %%"REGA", %%"REGA" \n\t" -#if !defined(__x86_64__) - /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions. - * This first loop is not the performance bottleneck anyway but it is trivial to tune - * using SSE2 if we have proper alignment. - */ - "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported?*/ - "jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway*/ -#endif - "cmpl $2, %%"REGC" \n\t" /* we have at least 16 bytes, 2 qwords? */ - "jl "vMaybeSSEMMX"f \n\t" /* n, don't bother*/ - - "shrl $1, %%"REGC" \n\t" /* do 16 bytes at a time instead*/ - "decl %%"REGC" \n" /* jigger loop ct */ - - ".align 16 \n\t" - - "movdqa "_FPround1", %%xmm0 \n\t" - "movdqa "_vWeight1", %%xmm5 \n\t" - "movdqa "_vWeight2", %%xmm6 \n\t" - "movdqa "_YMask", %%xmm7 \n" - - ""vLoopSSE2_Fetch": \n\t" -#ifdef PREFETCH - " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t" - " prefetcht0 16(%%"REGD", %%"REGA", 2) \n" -#endif - ""vLoopSSE2": \n\t" - " movdqu (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */ - " movdqu (%%"REGD", %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */ - - " movdqa %%xmm1, %%xmm3 \n\t" /* get chroma bytes */ - " pand %%xmm7, %%xmm1 \n\t" /* keep only luma */ - " psrlw $8, %%xmm3 \n\t" /* right just chroma */ - " pmullw %%xmm5, %%xmm1 \n\t" /* mult by weighting factor */ - " pmullw %%xmm5, %%xmm3 \n\t" /* mult by weighting factor */ - - " movdqa %%xmm2, %%xmm4 \n\t" /* get chroma bytes */ - " pand %%xmm7, %%xmm2 \n\t" /* keep only luma */ - " psrlw $8, 
%%xmm4 \n\t" /* right just chroma */ - " pmullw %%xmm6, %%xmm2 \n\t" /* mult by weighting factor */ - " pmullw %%xmm6, %%xmm4 \n\t" /* mult by weighting factor */ - - " paddw %%xmm2, %%xmm1 \n\t" /* combine lumas */ - " paddusw %%xmm0, %%xmm1 \n\t" /* round */ - " psrlw $8, %%xmm1 \n\t" /* right adjust luma */ -#ifdef STREAMING_STORE_TMP - " movntdq %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */ -#else - " movdqu %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */ -#endif - " paddw %%xmm4, %%xmm3 \n\t" /* combine chromas */ - " paddusw %%xmm0, %%xmm3 \n\t" /* round */ - " psrlw $8, %%xmm3 \n\t" /* right adjust chroma */ - " packuswb %%xmm3, %%xmm3 \n\t" /* pack UV's into low dword */ - " movdq2q %%xmm3, %%mm1 \n\t" /* move the packed chroma into mm1 */ -#ifdef STREAMING_STORE_TMP - " movntq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */ -#else - " movq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */ -#endif - " lea 8(%%"REGA"), %%"REGA" \n\t" - " decl %%"REGC" \n\t" - - " jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */ - " jz "vLoopSSE2"b \n\t" /* or just loop, or not */ - - /* done with our SSE2 fortified loop but we may need to pick up the spare change */ -#ifdef STREAMING_STORE_TMP - " sfence \n\t" -#endif - " movl "_src_row_size", %%"REGC" \n\t" /* get count again */ - " andl $15, %%"REGC" \n\t" /* just need mod 16 */ - - " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */ - " movq "_vWeight1", %%mm5 \n\t" - " movq "_vWeight2", %%mm6 \n\t" - " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */ - - " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */ - " jz "MoreSpareChange"f \n" /* n, did them all */ - - /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions. - * This first loop is not the performance bottleneck anyway but it is trivial to tune - * using SSE if we have proper alignment. 
- */ - ""vMaybeSSEMMX": \n\t" - - " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */ - " movq "_vWeight1", %%mm5 \n\t" - " movq "_vWeight2", %%mm6 \n\t" - " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */ -#if !defined(__x86_64__) - " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXT supported? */ - " jz "vLoopMMX"f \n\t" /* n, can't do anyway */ -#endif - " decl %%"REGC" \n" /* jigger loop ctr */ - - ".align 16 \n" - ""vLoopSSEMMX_Fetch": \n\t" -#ifdef PREFETCH - " prefetcht0 8(%%"REGSI", %%"REGA", 2) \n\t" - " prefetcht0 8(%%"REGD", %%"REGA", 2) \n" -#endif - ""vLoopSSEMMX": \n\t" - " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */ - " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */ - - " movq %%mm1, %%mm3 \n\t" /* copy top bytes */ - " pand %%mm7, %%mm1 \n\t" /* keep only luma */ - " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */ - " psrlw $8, %%mm3 \n\t" /* right just chroma */ - " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */ - " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */ - - " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */ - " pand %%mm7, %%mm2 \n\t" /* keep only luma */ - " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */ - " psrlw $8, %%mm4 \n\t" /* right just chroma */ - " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */ - " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */ - - " paddw %%mm2, %%mm1 \n\t" /* combine lumas */ - " paddusw %%mm0, %%mm1 \n\t" /* round */ - " psrlw $8, %%mm1 \n\t" /* right adjust luma */ -#ifdef STREAMING_STORE_TMP - " movntq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */ -#else - " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */ -#endif - " paddw %%mm4, %%mm3 \n\t" /* combine chromas */ - " paddusw %%mm0, %%mm3 \n\t" /* round */ - " psrlw $8, %%mm3 \n\t" /* right adjust chroma */ - " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */ - " movd %%mm3, 
(%%"REGB", %%"REGA") \n\t" /* save in our work area */ - - " lea 4(%%"REGA"), %%"REGA" \n\t" - " decl %%"REGC" \n\t" - " jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */ - " jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */ -#ifdef STREAMING_STORE_TMP - " sfence \n\t" -#endif - " jmp "MoreSpareChange"f \n" /* all done with vertical */ - - ".align 16 \n" - ""vLoopMMX": \n\t" - - " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */ - " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */ - - " movq %%mm1, %%mm3 \n\t" /* copy top bytes */ - " pand %%mm7, %%mm1 \n\t" /* keep only luma */ - " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */ - " psrlw $8, %%mm3 \n\t" /* right just chroma */ - " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */ - " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */ - - " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */ - " pand %%mm7, %%mm2 \n\t" /* keep only luma */ - " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */ - " psrlw $8, %%mm4 \n\t" /* right just chroma */ - " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */ - " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */ - - " paddw %%mm2, %%mm1 \n\t" /* combine lumas */ - " paddusw %%mm0, %%mm1 \n\t" /* round */ - " psrlw $8, %%mm1 \n\t" /* right adjust luma */ - " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */ - - " paddw %%mm4, %%mm3 \n\t" /* combine chromas */ - " paddusw %%mm0, %%mm3 \n\t" /* round */ - " psrlw $8, %%mm3 \n\t" /* right adjust chroma */ - " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */ - " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */ - - " lea 4(%%"REGA"), %%"REGA" \n\t" - " loop "vLoopMMX"b \n" - - /* Add a little code here to check if we have 2 more pixels to do and, if so, make one - * more pass thru vLoopMMX. 
We were processing in multiples of 4 pixels and always have - * an even number so there will never be more than 2 left. trbarry 7/29/2002 - */ - ""MoreSpareChange": \n\t" - - " cmpl "_EndOffset", %%"REGEA" \n\t" /* did we get them all */ - " jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */ - " movl $1, %%"REGC" \n\t" /* jigger loop ct */ - " sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */ - " jmp "vLoopMMX"b \n" - - /* We've taken care of the vertical scaling, now do horizontal */ - ""DoHorizontal": \n\t" - - " movq "_YMask", %%mm7 \n\t" /* useful 0U0U.. mask constant */ - " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */ - " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */ - " movl "_row_size", %%"REGC" \n\t" - " shrl $2, %%"REGC" \n\t" /* 4 bytes a time, 2 pixels */ - " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */ - " mov "_dstp", %%"REGDI" \n\t" /* the destination line */ - " mov "_vWorkUVW", %%"REGB" \n" /* chroma data, as UVUV UVUV... 
*/ - - ".align 16 \n" - ""hLoopMMX": \n\t" - - /* x86_64: must use movl (accessing table of uint32's) */ - " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ - " movd (%%"REGD", %%"REGA", 2), %%mm0 \n\t" /* copy luma pair */ - " shr $1, %%"REGA" \n\t" /* div offset by 2 */ - " movd (%%"REGB", %%"REGA", 2), %%mm1 \n\t" /* copy UV pair VUVU */ - " psllw $8, %%mm1 \n\t" /* shift out V, keep 0000U0U0 */ - - /* we need to use both even and odd chroma from same location - trb 9/2002 */ - " punpckldq (%%"REGB", %%"REGA", 2), %%mm1 \r\n" /* copy UV pair VUVU */ - " psrlw $8, %%mm1 \r\n" /* shift out U0, keep 0V0V 0U0U */ - " movl 20(%%"REGSI"), %%"REGEA" \r\n" /* get data offset in pixels, 2nd pixel pair */ - " punpckldq (%%"REGD", %%"REGA", 2), %%mm0 \r\n" /* copy luma pair */ - - " pmaddwd (%%"REGSI"), %%mm0 \r\n" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm0 \r\n" /* round */ - " psrlw $8, %%mm0 \r\n" /* right just 2 luma pixel value 000Y,000Y */ - - " pmaddwd 8(%%"REGSI"), %%mm1 \r\n" /* mult and sum chromas by ctl weights */ - " paddusw %%mm6, %%mm1 \r\n" /* round */ - " pslld $8, %%mm1 \r\n" /* shift into low bytes of different words */ - " pand %%mm7, %%mm1 \r\n" /* keep only 2 chroma values 0V00,0U00 */ - " por %%mm1, %%mm0 \r\n" /* combine luma and chroma, 0V0Y,0U0Y */ - " packuswb %%mm0, %%mm0 \r\n" /* pack all into low dword, xxxxVYUY */ - " movd %%mm0, (%%"REGDI") \n\t" /* done with 2 pixels */ - - " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes (24 bytes = 6 uint32 slots per pixel pair) */ - " lea 4(%%"REGDI"), %%"REGDI" \n\t" /* bump to next output pixel addr */ - - " loop "hLoopMMX"b \n\t" /* loop for more */ - - "emms \n\t" - /* done with one line */ - -#if !defined(__x86_64__) - "mov "_oldbx", %%"REGB" \n\t" -#endif - :: - "m" /*0*/(FPround1), - "m" /*1*/(vWeight1), - "m" /*2*/(vWeight2), - "m" /*3*/(YMask), - "m" /*4*/(src_row_size), - "m" /*5*/(EndOffset), - "m" /*6*/(pControl), - "m" /*7*/(row_size), - "m" /*8*/(vWorkYW), - 
"m" /*9*/(dstp), - "m" /*10*/(vWorkUVW), - "m" /*11*/(FPround2), - "m" /*12*/(srcp1), - "m" /*13*/(srcp2) -#if !defined(__x86_64__) - , - "m" /*14*/(oldbx), - "m" /*15*/(SSEMMXenabledW), - "m" /*16*/(SSE2enabledW) - : REGA, /*REGB is not listed: the asm saves it to oldbx and restores it itself,*/ REGC, REGD, REGSI, REGDI -#else - : REGA, REGB, REGC, REGD, REGSI, REGDI -#endif - ); - - dstp += dst_pitch; - } -#endif - return 0; -} - -static int do_warp_yv12(uint8_t *dst, const uint8_t * const src, - const int dst_pitch, const int src_pitch, - const int dst_width, const int dst_height, - const int src_width, const int src_height, - const int Interlaced, const uint32_t * const hControl, - const uint32_t * vOffsets, const uint32_t * vWeights, - uint32_t *vWorkY, int dst_start) -{ -#if defined(__i386__) || defined(__x86_64__) - sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */ - sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */ - sse2_t vWeight1; - sse2_t vWeight2; - - const uint32_t *pControl = &hControl[0]; - const uint32_t *vWorkYW = vWorkY; - const uint8_t *srcp = src; - const uint8_t *srcp1; - const uint8_t *srcp2; - uint8_t *dstp = dst + dst_pitch*dst_start; - - const uint32_t src_row_size = src_width; - const uint32_t row_size = dst_width; - -#if !defined(__x86_64__) - const int accel = xine_mm_accel(); - const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */ - const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */ - long int oldbx; -#endif - int y; - - /* Operation in sliced mode: - * - continue until required next source line is out of slice - * - return next output line - * - at next call, continue from next source line - */ - - for (y = dst_start; y < dst_height; y++) { - if(vOffsets[y] >= src_height) { - /* slice completed */ - /*DBG("do_warp_yv12: max input height reached: need line %d, height %d\n -> Returning next output line: %d , start 
was %d\n", - (int)vOffsets[y], (int)src_height, (int)y, (int)dst_start);*/ - return y; - } - - vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] = - (256-vWeights[y]) << 16 | (256-vWeights[y]); - vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] = - vWeights[y] << 16 | vWeights[y]; - - srcp1 = srcp + vOffsets[y] * src_pitch; - - if (Interlaced) - srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1; - else - srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1; - - __asm__ __volatile__( - "movl "_src_row_size", %%"REGC" \n\t" - "shr $3, %%"REGC" \n\t" /* 8 bytes a time */ - "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */ - "mov "_srcp2", %%"REGD" \n\t" /* next " */ - "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */ - "xor %%"REGA", %%"REGA" \n\t" -#if !defined(__x86_64__) - /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions. - * This first loop is not the performance bottleneck anyway but it is trivial to tune - * using SSE2 if we have proper alignment. - */ - "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported? */ - "jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway */ -#endif - "cmpl $2, %%"REGC" \n\t" /* we have at least 16 byts, 2 qwords? */ - "jl "vMaybeSSEMMX"f \n\t" /* n, don't bother */ - - "mov %%"REGSI", %%"REGB" \n\t" - "or %%"REGD", %%"REGB" \n\t" - "test $15, %%"REGB" \n\t" /* both src rows 16 byte aligned? 
*/ - "jnz "vMaybeSSEMMX"f \n\t" /* n, don't use sse2 */ - - "shr $1, %%"REGC" \n\t" /* do 16 bytes at a time instead */ - "dec %%"REGC" \n\t" /* jigger loop ct */ - - "movdqa "_FPround1", %%xmm0 \n\t" - "movdqa "_vWeight1", %%xmm5 \n\t" - "movdqa "_vWeight2", %%xmm6 \n\t" - "pxor %%xmm7, %%xmm7 \n" - - ".align 16 \n" - ""vLoopSSE2_Fetch": \n\t" -#ifdef PREFETCH - " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t" - " prefetcht0 16(%%"REGD", %%"REGA", 2) \n" -#endif - ""vLoopSSE2": \n\t" - /* we're already checked pointers to be on dqword aligned */ - " movdqa (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */ - " movdqa (%%"REGD", %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */ - " movdqa %%xmm1, %%xmm2 \n\t" - " movdqa %%xmm3, %%xmm4 \n\t" - - " punpcklbw %%xmm7, %%xmm1 \n\t" /* make words */ - " punpckhbw %%xmm7, %%xmm2 \n\t" /* " */ - " punpcklbw %%xmm7, %%xmm3 \n\t" /* " */ - " punpckhbw %%xmm7, %%xmm4 \n\t" /* " */ - - " pmullw %%xmm5, %%xmm1 \n\t" /* mult by top weighting factor */ - " pmullw %%xmm5, %%xmm2 \n\t" /* " */ - " pmullw %%xmm6, %%xmm3 \n\t" /* mult by bot weighting factor */ - " pmullw %%xmm6, %%xmm4 \n\t" /* " */ - - " paddw %%xmm3, %%xmm1 \n\t" /* combine lumas low */ - " paddw %%xmm4, %%xmm2 \n\t" /* combine lumas high */ - - " paddusw %%xmm0, %%xmm1 \n\t" /* round */ - " paddusw %%xmm0, %%xmm2 \n\t" /* round */ - - " psrlw $8, %%xmm1 \n\t" /* right adjust luma */ - " psrlw $8, %%xmm2 \n\t" /* right adjust luma */ - - " packuswb %%xmm2, %%xmm1 \n\t" /* pack words to our 16 byte answer */ -#ifdef STREAMING_STORE_TMP - " movntdq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ -#else - " movdqu %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ -#endif - " lea 16(%%"REGA"), %%"REGA" \n\t" - " decl %%"REGC" \n\t" - - " jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */ - " jz "vLoopSSE2"b \n\t" /* or just loop, or not */ - - /* done with our SSE2 fortified loop but we may need 
to pick up the spare change */ -#ifdef STREAMING_STORE_TMP - " sfence \n\t" -#endif - " movl "_src_row_size", %%"REGC" \n\t" /* get count again */ - " andl $15, %%"REGC" \n\t" /* just need mod 16 */ - " movq "_vWeight1", %%mm5 \n\t" - " movq "_vWeight2", %%mm6 \n\t" - " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */ - - " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */ - " jz "MoreSpareChange"f \n" /* n, did them all */ - - /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions. - * This first loop is not the performance bottleneck anyway but it is trivial to tune - * using SSE if we have proper alignment. - */ - ""vMaybeSSEMMX": \n\t" - - " movq "_vWeight1", %%mm5 \n\t" - " movq "_vWeight2", %%mm6 \n\t" - " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */ - " pxor %%mm7, %%mm7 \n\t" -#if !defined(__x86_64__) - " testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXTsupported? */ - " jz "vLoopMMX"f \n\t" /* n, can't do anyway */ -#endif - " decl %%"REGC" \n" /* jigger loop ctr */ - - ".align 16 \n" - ""vLoopSSEMMX_Fetch": \n\t" -#ifdef PREFETCH - " prefetcht0 8(%%"REGSI", %%"REGA") \n\t" - " prefetcht0 8(%%"REGD", %%"REGA") \n" -#endif - ""vLoopSSEMMX": \n\t" - - " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */ - " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */ - - " movq %%mm1, %%mm2 \n\t" - " movq %%mm3, %%mm4 \n\t" - - " punpcklbw %%mm7, %%mm1 \n\t" /* make words */ - " punpckhbw %%mm7, %%mm2 \n\t" /* " */ - " punpcklbw %%mm7, %%mm3 \n\t" /* " */ - " punpckhbw %%mm7, %%mm4 \n\t" /* " */ - - " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */ - " pmullw %%mm5, %%mm2 \n\t" /* " */ - " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */ - " pmullw %%mm6, %%mm4 \n\t" /* " */ - - " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */ - " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */ - - " paddusw %%mm0, %%mm1 \n\t" /* round */ - " paddusw %%mm0, %%mm2 
\n\t" /* round */ - - " psrlw $8, %%mm1 \n\t" /* right adjust luma */ - " psrlw $8, %%mm2 \n\t" /* right adjust luma */ - - " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 8 byte answer */ -#ifdef STREAMING_STORE_TMP - " movntq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ -#else - " movq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ -#endif - " lea 8(%%"REGA"), %%"REGA" \n\t" - " decl %%"REGC" \n\t" - - " jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */ - " jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */ -#ifdef STREAMING_STORE_TMP - " sfence \n\t" -#endif - " jmp "MoreSpareChange"f \n" /* all done with vertical */ - - ".align 16 \n" - ""vLoopMMX": \n\t" - - " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */ - " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */ - - " movq %%mm1, %%mm2 \n\t" - " movq %%mm3, %%mm4 \n\t" - - " punpcklbw %%mm7, %%mm1 \n\t" /* make words */ - " punpckhbw %%mm7, %%mm2 \n\t" /* " */ - " punpcklbw %%mm7, %%mm3 \n\t" /* " */ - " punpckhbw %%mm7, %%mm4 \n\t" /* " */ - - " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */ - " pmullw %%mm5, %%mm2 \n\t" /* " */ - " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */ - " pmullw %%mm6, %%mm4 \n\t" /* " */ - - " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */ - " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */ - - " paddusw %%mm0, %%mm1 \n\t" /* round */ - " paddusw %%mm0, %%mm2 \n\t" /* round */ - - " psrlw $8, %%mm1 \n\t" /* right adjust luma */ - " psrlw $8, %%mm2 \n\t" /* right adjust luma */ - - " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 8 byte answer */ - " movq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ - - " lea 8(%%"REGA"), %%"REGA" \n\t" - " loop "vLoopMMX"b \n" - - /* Add a little code here to check if we have more pixels to do and, if so, make one - * more pass thru vLoopMMX. 
We were processing in multiples of 8 pixels and alway have - * an even number so there will never be more than 7 left. - */ - ""MoreSpareChange": \n\t" - - " cmpl "_src_row_size", %%"REGEA" \n\t" /* did we get them all */ - " jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */ - " movl $1, %%"REGC" \n\t" /* jigger loop ct */ - " movl "_src_row_size", %%"REGEA" \n\t" - " sub $8, %%"REGA" \n\t" /* back up to last 8 pixels */ - " jmp "vLoopMMX"b \n" - - /* We've taken care of the vertical scaling, now do horizontal */ - ""DoHorizontal": \n\t" - " pxor %%mm7, %%mm7 \n\t" - " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */ - " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */ - " movl "_row_size", %%"REGC" \n\t" - " shrl $2, %%"REGC" \n\t" /* 4 bytes a time, 4 pixels */ - " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */ - " mov "_dstp", %%"REGDI" \n\t" /* the destination line */ -#if !defined(__x86_64__) - " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? 
*/ - " jz "hLoopMMX"f \n\t" /* n, can't do anyway */ -#endif - /* With SSE support we will make 8 pixels (from 8 pairs) at a time */ - " shrl $1, %%"REGC" \n\t" /* 8 bytes a time instead of 4 */ - " jz "LessThan8"f \n" - - ".align 16 \n" - ""hLoopMMXSSE": \n\t" - - - /* handle first 2 pixels */ - /* phi: must use movl here (x86_64, reading from table of uint_32's) */ - " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ - " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */ - - " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */ - " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */ - " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ - " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */ - " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */ - " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm0 \n\t" /* round */ - " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ - - /* handle 3rd and 4th pixel pairs */ - " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */ - " punpcklwd (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */ - " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ - " movl 16+48(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 5st pixel pair */ - " movl 20+48(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 6nd pixel pair */ - " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm1 \n\t" /* round */ - " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ - - /* handle 5th and 6th pixel pairs */ - " movd (%%"REGD", %%"REGA"), %%mm2 \n\t" /* copy luma pair 0000xxYY */ - " punpcklwd (%%"REGD", %%"REGB"), %%mm2 \r\n" /* 2nd luma pair, now xxxxYYYY */ - " 
punpcklbw %%mm7, %%mm2 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ - " movl 16+72(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 7st pixel pair */ - " movl 20+72(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 8nd pixel pair */ - " pmaddwd 48(%%"REGSI"), %%mm2 \n\t" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm2 \n\t" /* round */ - " psrlw $8, %%mm2 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ - - /* handle 7th and 8th pixel pairs */ - " movd (%%"REGD", %%"REGA"), %%mm3 \n\t" /* copy luma pair 0000xxYY */ - " punpcklwd (%%"REGD", %%"REGB"), %%mm3 \r\n" /* 2nd luma pair, now xxxxYYYY */ - " punpcklbw %%mm7, %%mm3 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ - " pmaddwd 72(%%"REGSI"), %%mm3 \n\t" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm3 \n\t" /* round */ - " psrlw $8, %%mm3 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ - - /* combine, store, and loop */ - " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */ - " packuswb %%mm3, %%mm2 \n\t" /* pack into qword, 0Y0Y0Y0Y */ - " packuswb %%mm2, %%mm0 \n\t" /* and again into YYYYYYYY */ -#ifdef STREAMING_STORE - " movntq %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */ -#else - " movq %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */ -#endif - - " lea 96(%%"REGSI"), %%"REGSI" \n\t" - " lea 8(%%"REGDI"), %%"REGDI" \n\t" - " decl %%"REGC" \n\t" - " jg "hLoopMMXSSE"b \n\t" /* loop for more */ -#ifdef STREAMING_STORE - " sfence \n" -#endif - ""LessThan8": \n\t" - " movl "_row_size", %%"REGC" \n\t" - " andl $7, %%"REGC" \n\t" /* we have done all but maybe this */ - " shrl $2, %%"REGC" \n\t" /* now do only 4 bytes at a time */ - " jz "LessThan4"f \n" - - ".align 16 \n" - ""hLoopMMX": \n\t" - - /* handle first 2 pixels */ - " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ - " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */ - " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy 
luma pair 0000xxYY */ - " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */ - " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ - " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */ - " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */ - " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm0 \n\t" /* round */ - " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ - - /* handle 3rd and 4th pixel pairs */ - " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */ - " punpckldq (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */ - " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ - " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm1 \n\t" /* round */ - " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ - - /* combine, store, and loop */ - " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */ - " packuswb %%mm7, %%mm0 \n\t" /* and again into 0000YYYY */ - " movd %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */ - " lea 48(%%"REGSI"), %%"REGSI" \n\t" - " lea 4(%%"REGDI"), %%"REGDI" \n\t" - - " loop "hLoopMMX"b \n" /* loop for more */ - - /* test to see if we have a mod 4 size row, if not then more spare change */ - ""LessThan4": \n\t" - " movl "_row_size", %%"REGC" \n\t" - " andl $3, %%"REGC" \n\t" /* remainder side mod 4 */ - " cmpl $2, %%"REGC" \n\t" - " jl "LastOne"f \n\t" /* none, none */ - - /* handle 2 more pixels */ - " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ - " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */ - " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */ - " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */ - " punpcklbw 
%%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ - - " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm0 \n\t" /* round */ - " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ - " packuswb %%mm7, %%mm0 \n\t" /* pack into qword, 00000Y0Y */ - " packuswb %%mm7, %%mm0 \n\t" /* and again into 000000YY */ - " movd %%mm0, (%%"REGDI") \n\t" /* store, we are guarrenteed room in buffer (8 byte mult) */ - " subl $2, %%"REGC" \n\t" - - " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes */ - " lea 2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */ - - /* maybe one last pixel */ - ""LastOne": \n\t" - " cmpl $0, %%"REGC" \r\n" /* still more ? */ - " jz "AllDone"f \r\n" /* n, done */ - " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ - " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */ - " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ - - " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */ - " paddusw %%mm6, %%mm0 \n\t" /* round */ - " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ - " movd %%mm0, %%"REGEA" \n\t" - " movb %%al, (%%"REGDI") \n" /* store last one */ - - ""AllDone": \n\t" - " emms \n\t" -#if !defined(__x86_64__) - "mov "_oldbx", %%"REGB" \n\t" -#endif - :: - "m" /*0*/(FPround1), - "m" /*1*/(vWeight1), - "m" /*2*/(vWeight2), - "m" /*3*/(y/*YMask[0]*/), - "m" /*4*/(src_row_size), - "m" /*5*/(y/*EndOffset*/), - "m" /*6*/(pControl), - "m" /*7*/(row_size), - "m" /*8*/(vWorkYW), - "m" /*9*/(dstp), - "m" /*10*/(y/*vWorkUVW*/), - "m" /*11*/(FPround2), - "m" /*12*/(srcp1), - "m" /*13*/(srcp2) -#if !defined(__x86_64__) - , - "m" /*14*/(oldbx), - "m" /*15*/(SSEMMXenabledW), - "m" /*16*/(SSE2enabledW) - : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI -#else - : REGA, REGB, REGC, REGD, REGSI, REGDI -#endif - ); - - dstp += dst_pitch; - } -#endif - return 0; 
-} - -/* - * tools - */ - -#ifndef ALIGN -# define ALIGN(b,p) ((void*)((((unsigned long)(p)) + (b)-1) & (~((b)-1)))) -#endif -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a) : (b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a) > (b) ? (a) : (b)) -#endif -#ifndef FABS -# define FABS(x) ((x) < 0.0 ? -(x) : (x)) -#endif - -/* - * xine plugin - */ - -#define PLUGIN_ID "warp" -#define PLUGIN_DESCR "(non-)linear software scaling post plugin"; -#define PLUGIN_T warp_plugin_t -/*#define POST_THREADS*/ -/*#define POST_SLICES*/ -#include "xine/post_util.h" - - -/* plugin class initialization function */ -void *warp_init_plugin(xine_t *xine, void *); - -/* plugin class functions */ -static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs, - xine_audio_port_t **audio_target, - xine_video_port_t **video_target); - -/* plugin instance functions */ -static void warp_dispose(post_plugin_t *this_gen); - -/* vo_frame functions */ -static vo_frame_t *got_frame(vo_frame_t *frame); -static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame); - - -/* parameter functions */ -static xine_post_api_descr_t *warp_get_param_descr(void); -static int warp_set_parameters(xine_post_t *this_gen, void *param_gen); -static int warp_get_parameters(xine_post_t *this_gen, void *param_gen); -static char *warp_get_help(void); - - -typedef struct warp_parameters_s { - int output_width; - int output_height; - double output_aspect; - int no_downscaling; -} warp_parameters_t; - -START_PARAM_DESCR(warp_parameters_t) -PARAM_ITEM(POST_PARAM_TYPE_INT, output_width, NULL, 640, 1920, 0, - "output video width") -PARAM_ITEM(POST_PARAM_TYPE_INT, output_height, NULL, 480, 1080, 0, - "output video height") -PARAM_ITEM(POST_PARAM_TYPE_DOUBLE, output_aspect, NULL, 1, 3, 0, - "output video aspect ratio") -PARAM_ITEM(POST_PARAM_TYPE_BOOL, no_downscaling,NULL, 0, 1, 0, - "disable downscaling") -END_PARAM_DESCR(warp_param_descr) - - -typedef struct { - post_plugin_t post; - - xine_post_in_t 
parameter_input; - - /* User config (changes to actual config are delayed) */ - warp_parameters_t config; - - /* Current config */ - int enable; - int output_width; - int output_height; - double output_aspect; - double factor_x; - double factor_y; - - /* Last seen input frame */ - int input_width; - int input_height; - int input_format; - int input_interlaced; - double input_aspect; - - /* working buffers */ - uint32_t *vWorkY; - uint32_t *vWorkUV; - - /* scaling tables */ - uint32_t *hControl; - uint32_t *hControlUV; - uint32_t *vOffsets; - uint32_t *vOffsetsUV; - uint32_t *vWeights; - uint32_t *vWeightsUV; - - /* memory for work areas and scaling tables */ - void *pMem; - -} warp_plugin_t; - -/* - * - */ - -static void init_tables(warp_plugin_t *this) -{ -#define BP(x) ((uint8_t*)(x)) - /* allocate memory for scaling tables and workspace */ - free(this->pMem); - this->pMem = malloc(this->input_width*3 + this->output_width*sizeof(uint32_t)*3*2 + - this->output_height*sizeof(uint32_t)*4 + 2*9*128); - - /* - aligned for P4 cache line */ - this->vWorkY = (uint32_t*)ALIGN(128, this->pMem); - this->vWorkUV = (uint32_t*)ALIGN(128, BP(this->vWorkY) + this->input_width*2 + 128); - this->hControl = (uint32_t*)ALIGN(128, BP(this->vWorkUV) + this->input_width + 128); - this->vOffsets = (uint32_t*)ALIGN(128, BP(this->hControl) + this->output_width * sizeof(uint32_t) * 3 + 128); - this->vWeights = (uint32_t*)ALIGN(128, BP(this->vOffsets) + this->output_height * sizeof(uint32_t) + 128); - - if (this->input_format == XINE_IMGFMT_YV12) { - this->vOffsetsUV = (uint32_t*)ALIGN(128, BP(this->vWeights) + this->output_height * sizeof(uint32_t) + 128); - this->vWeightsUV = (uint32_t*)ALIGN(128, BP(this->vOffsetsUV) + this->output_height * sizeof(uint32_t) + 128); - this->hControlUV = (uint32_t*)ALIGN(128, BP(this->vWeightsUV) + this->output_height * sizeof(uint32_t) + 128); - - init_tables_yv12(this->output_width, this->output_height, - this->input_width, this->input_height, - 
this->input_interlaced, this->factor_x, this->factor_y, - this->hControl, this->vOffsets, this->vWeights, - this->hControlUV, this->vOffsetsUV, this->vWeightsUV ); - - } else if (this->input_format == XINE_IMGFMT_YUY2) { - - init_tables_yuy2(this->output_width, this->output_height, - this->input_width, this->input_height, - this->input_interlaced, this->factor_x, this->factor_y, - this->hControl, this->vOffsets, this->vWeights ); - } -} - -static void calculate_factors(warp_plugin_t *this) -{ - /* try to guess amount to stretch/shrink */ - double adiff = this->input_aspect - this->output_aspect; - this->factor_x = 1.0; - this->factor_y = 1.0; - - if (adiff > 0.1) { - - if (adiff > 0.1 + ((16.0-12.0)/9.0)) { - /* >16:9 -> >4:3 */ - DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff); - this->factor_x = 0.95; - this->factor_y = 1.15; - this->output_aspect += (adiff - 4.0/9.0); - DBG(" changing target ratio to %3.1lf\n", this->output_aspect); - } else { - /* 16:9 ... 12:9 -> 4:3 */ - DBG("aspect ratio diff %1.3lf > 0 : 16.9...12:9 -> 4:3\n", adiff); - this->factor_x = 1.0 - 0.05 * adiff * 9.0/4.0; - this->factor_y = 1.0 + 0.15 * adiff * 9.0/4.0; - } - - } else if (adiff < -0.1) { - - if(adiff < -0.1-((16.0-12.0)/9.0)) { - /* <4:3 -> <16:9 */ - DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff); - this->factor_x = 1.05; - this->factor_y = 0.85; - this->output_aspect += (adiff + 4.0/9.0); - DBG(" changing target ratio to %3.1lf\n", this->output_aspect); - } else { - /* 4:3...16:9 -> 16:9 */ - DBG("aspect ratio diff %1.3lf < 0 : 4:3...16:9 -> 16:9\n", adiff); - this->factor_x = 1.0 + 0.05 * adiff * 9.0/4.0; - this->factor_y = 1.0 - 0.15 * adiff * 9.0/4.0; - } - - } else { - DBG("aspect ratio matches, no warp\n"); - this->factor_x = 1.0; - this->factor_y = 1.0; - } - - DBG("factor_x = %1.3lf factor_y = %1.3lf output ratio = %1.3lf\n", - this->factor_x, this->factor_y, this->output_aspect); -} - -/* - * - */ - -void *warp_init_plugin(xine_t *xine, void 
*data) -{ -#if !defined(__x86_64__) - /* Need at least MMX */ - if (!(xine_mm_accel() & MM_ACCEL_X86_MMX)) { - fprintf(stderr, "warp_init_plugin: ERROR: at least MMX required\n"); - return NULL; - } -#endif - - return init_plugin(xine, data); -} - -static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs, - xine_audio_port_t **audio_target, - xine_video_port_t **video_target) -{ - warp_plugin_t *this = calloc(1, sizeof(warp_plugin_t)); - post_plugin_t *this_gen = (post_plugin_t *) this; - post_in_t *input; - post_out_t *output; - xine_post_in_t *input_param; - post_video_port_t *port; - - static xine_post_api_t post_api = - { warp_set_parameters, warp_get_parameters, warp_get_param_descr, warp_get_help }; - - if (!this || !video_target || !video_target[0]) { - free(this); - return NULL; - } - - _x_post_init(this_gen, 0, 1); - - port = _x_post_intercept_video_port(this_gen, video_target[0], &input, &output); - port->intercept_frame = intercept_frame_yuy; - port->new_frame->draw = post_draw; - input->xine_in.name = "video"; - output->xine_out.name = "video (scaled)"; - this_gen->xine_post.video_input[0] = &port->new_port; - - this_gen->dispose = warp_dispose; - - input_param = &this->parameter_input; - input_param->name = "parameters"; - input_param->type = XINE_POST_DATA_PARAMETERS; - input_param->data = &post_api; - xine_list_push_back(this_gen->input, input_param); - - this->config.output_aspect = 0.0; /* -> do not change aspect ratio */ - this->config.output_width = 0; /* -> do not change width */ - this->config.output_height = 0; /* -> do not change height */ - this->config.no_downscaling = 0; - - this->input_width = 0; /* not known yet, triggers initialization later */ - this->input_height = 0; - - return this_gen; -} - -static void warp_dispose(post_plugin_t *this_gen) -{ - if (_x_post_dispose(this_gen)) { - warp_plugin_t *this = (warp_plugin_t *) this_gen; - - DBG("dispose\n"); - - free(this->pMem); - free(this); - } -} - -static vo_frame_t 
*got_frame(vo_frame_t *frame) -{ - post_video_port_t *port = (post_video_port_t *)frame->port; - warp_plugin_t *this = (warp_plugin_t *)port->post; - double adiff = this->input_aspect - frame->ratio; - - if (this->input_width != frame->width || this->input_height != frame->height || - this->input_format != frame->format || FABS(adiff)>0.1 || - this->input_interlaced != !!(frame->flags & VO_INTERLACED_FLAG)) { - - DBG("detected frame format change: %dx%d -> %dx%d, interlaced %d->%d, aspect %1.3lf->%1.3lf, %s->%s\n", - this->input_width, this->input_height, frame->width, frame->height, - this->input_interlaced, !!(frame->flags & VO_INTERLACED_FLAG), - this->input_aspect, frame->ratio, - this->input_format==XINE_IMGFMT_YV12 ? "yv12":"yuy2", - frame->format==XINE_IMGFMT_YV12 ? "yv12":"yuy2" ); - - /* free tables and buffers */ - free(this->pMem); - this->pMem = NULL; - - /* remember frame properties to detect changes in video format */ - this->input_width = frame->width; - this->input_height = frame->height; - this->input_format = frame->format; - this->input_aspect = frame->ratio; - this->input_interlaced = !!(frame->flags & VO_INTERLACED_FLAG); - - /* re-configure target size and aspect ratio */ - this->output_aspect = this->config.output_aspect ?: frame->ratio; - if (!this->config.no_downscaling) { - this->output_width = this->config.output_width ?: frame->width; - this->output_height = this->config.output_height ?: frame->height; - } else { - this->output_width = MAX(this->config.output_width, frame->width); - this->output_height = MAX(this->config.output_height, frame->height); - } - - /* calculate warp function factors */ - calculate_factors(this); - - adiff = this->input_aspect - this->output_aspect; - if(this->output_width == frame->width && - this->output_height == frame->height && - adiff < 0.1 && - adiff > -0.1 ) { - this->enable = 0; - DBG("--> nothing to do, disabling processing for now\n"); - return NULL; - } - - this->enable = 1; - - init_tables(this); - 
} - - if (!this->enable) - return NULL; - - return port->original_port->get_frame(port->original_port, - this->output_width, this->output_height, - this->output_aspect, frame->format, - frame->flags | VO_BOTH_FIELDS); -} - -static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame) -{ - post_video_port_t *port = (post_video_port_t *)frame->port; - warp_plugin_t *this = (warp_plugin_t *)port->post; - int proc_height = frame->height; - - if (frame->format == XINE_IMGFMT_YV12) { - - do_warp_yv12(new_frame->base[0], frame->base[0], - new_frame->pitches[0], frame->pitches[0], - this->output_width, this->output_height, - frame->width, proc_height, - this->input_interlaced, - this->hControl, this->vOffsets, this->vWeights, - this->vWorkY, - 0); - proc_height /= 2; - do_warp_yv12(new_frame->base[1], frame->base[1], - new_frame->pitches[1], frame->pitches[1], - this->output_width/2, this->output_height/2, - frame->width/2, proc_height, - this->input_interlaced, - this->hControlUV, this->vOffsetsUV, this->vWeightsUV, - this->vWorkUV, - 0); - do_warp_yv12(new_frame->base[2], frame->base[2], - new_frame->pitches[2], frame->pitches[2], - this->output_width/2, this->output_height/2, - frame->width/2, proc_height, - this->input_interlaced, - this->hControlUV, this->vOffsetsUV, this->vWeightsUV, - this->vWorkUV, - 0); - - } else if (frame->format == XINE_IMGFMT_YUY2) { - do_warp_yuy2(new_frame->base[0], frame->base[0], - new_frame->pitches[0], frame->pitches[0], - this->output_width, this->output_height, - frame->width, proc_height, - this->input_interlaced, - this->hControl, this->vOffsets, this->vWeights, - this->vWorkY, this->vWorkUV, - 0); - } -} - -/* - * parameter functions - */ - -static xine_post_api_descr_t *warp_get_param_descr(void) -{ - return &warp_param_descr; -} - -static int warp_set_parameters(xine_post_t *this_gen, void *param_gen) -{ - warp_plugin_t *this = (warp_plugin_t *)this_gen; - warp_parameters_t *params = (warp_parameters_t *)param_gen; - - 
memcpy(&this->config, params, sizeof(warp_parameters_t)); - this->input_width = this->input_height = 0; - - if(this->config.output_aspect > 999) - this->config.output_aspect /= 1000.0; - - DBG("warp_set_parameters: " - "output_width=%d, output_height=%d, output_aspect=%4.3lf, no_downscaling=%d\n", - this->config.output_width, this->config.output_height, this->config.output_aspect, - this->config.no_downscaling); - - return 1; -} - -static int warp_get_parameters(xine_post_t *this_gen, void *param_gen) -{ - warp_plugin_t *this = (warp_plugin_t *)this_gen; - warp_parameters_t *params = (warp_parameters_t *)param_gen; - - DBG("warp_get_parameters\n"); - memcpy(params, &this->config, sizeof(warp_parameters_t)); - - return 1; -} - -static char *warp_get_help(void) { - return _( - "The warp plugin scales video to another resolution. " - "It supports non-linear stretching to change video aspect ratio. " - "\n" - "Parameters\n" - " output_width: Scale video to width\n" - " (0 -> do not change video width)\n" - " output_height: Scale video to height\n" - " (0 -> do not change video height)\n" - " output_aspect: Adjust aspect ratio using non-linear scaling\n" - " (0 -> do not change video aspect ratio)\n" - " no_downscaling: Do not downscale video\n" - "\n" - ); -} - - -/* - * plugin info - */ - -static post_info_t info = { XINE_POST_TYPE_VIDEO_FILTER }; - -const plugin_info_t xine_plugin_info[] __attribute__((visibility("default"))) = -{ - /* type, API, "name", version, special_info, init_function */ - { PLUGIN_POST, POST_PLUGIN_IFACE_VERSION, "warp", XINE_VERSION_CODE, &info, &warp_init_plugin }, - { PLUGIN_POST, POST_PLUGIN_IFACE_VERSION, "swscale", XINE_VERSION_CODE, &info, &warp_init_plugin }, - { PLUGIN_NONE, 0, "", 0, NULL, NULL } -}; |