-rw-r--r--   xine_post_swscale.c   141
1 file changed, 79 insertions, 62 deletions
diff --git a/xine_post_swscale.c b/xine_post_swscale.c
index bea7ce0a..2d2470c0 100644
--- a/xine_post_swscale.c
+++ b/xine_post_swscale.c
@@ -17,7 +17,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
- * $Id: xine_post_swscale.c,v 1.1 2008-02-20 22:31:23 phintuka Exp $
+ * $Id: xine_post_swscale.c,v 1.2 2008-02-21 02:30:53 phintuka Exp $
*
* Simple (faster) resize for avisynth
* Copyright (C) 2002 Tom Barry
@@ -443,6 +443,23 @@ static void init_tables_yuy2(int newwidth, int newheight, int oldwidth, int oldh
#define _SSE2enabledW "%16"
#endif
+/* Labels */
+#define vMaybeSSEMMX "1"
+#define LessThan8 "2"
+#define LessThan4 "3"
+#define AllDone "4"
+#define LastOne "5"
+#define vLoopSSE2_Fetch "6"
+#define vLoopSSE2 "7"
+#define vLoopSSEMMX_Fetch "8"
+#define vLoopSSEMMX "9"
+#define vLoopMMX "10"
+#define MoreSpareChange "11"
+#define DoHorizontal "12"
+#define hLoopMMX "13"
+#define hLoopMMXSSE "14"
+
+
/* structure for mmx constants */
typedef union {
uint64_t uq[1]; /* Unsigned Quadword */
@@ -530,10 +547,10 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
* using SSE2 if we have proper alignment.
*/
"testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported?*/
- "jz vMaybeSSEMMX \n\t" /* n, can't do anyway*/
+ "jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway*/
#endif
"cmpl $2, %%"REGC" \n\t" /* we have at least 16 bytes, 2 qwords? */
- "jl vMaybeSSEMMX \n\t" /* n, don't bother*/
+ "jl "vMaybeSSEMMX"f \n\t" /* n, don't bother*/
"shrl $1, %%"REGC" \n\t" /* do 16 bytes at a time instead*/
"decl %%"REGC" \n" /* jigger loop ct */
@@ -545,12 +562,12 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
"movdqa "_vWeight2", %%xmm6 \n\t"
"movdqa "_YMask", %%xmm7 \n"
- "vLoopSSE2_Fetch: \n\t"
+ ""vLoopSSE2_Fetch": \n\t"
#ifdef PREFETCH
" prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
" prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
#endif
- "vLoopSSE2: \n\t"
+ ""vLoopSSE2": \n\t"
" movdqu (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
" movdqu (%%"REGD", %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */
@@ -587,8 +604,8 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
" lea 8(%%"REGA"), %%"REGA" \n\t"
" decl %%"REGC" \n\t"
- " jg vLoopSSE2_Fetch \n\t" /* if not on last one loop, prefetch */
- " jz vLoopSSE2 \n\t" /* or just loop, or not */
+ " jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */
+ " jz "vLoopSSE2"b \n\t" /* or just loop, or not */
/* done with our SSE2 fortified loop but we may need to pick up the spare change */
#ifdef STREAMING_STORE
@@ -602,14 +619,14 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
" movq "_vWeight2", %%mm6 \n\t"
" movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
- " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
- " jz MoreSpareChange \n" /* n, did them all */
+ " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
+ " jz "MoreSpareChange"f \n" /* n, did them all */
/* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
* This first loop is not the performance bottleneck anyway but it is trivial to tune
* using SSE if we have proper alignment.
*/
- "vMaybeSSEMMX: \n\t"
+ ""vMaybeSSEMMX": \n\t"
" movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
" movq "_vWeight1", %%mm5 \n\t"
@@ -617,17 +634,17 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
" movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
#if !defined(__x86_64__)
" testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
- " jz vLoopMMX \n\t" /* n, can't do anyway */
+ " jz "vLoopMMX"f \n\t" /* n, can't do anyway */
#endif
" decl %%"REGC" \n" /* jigger loop ctr */
- ".align 16 \n"
- "vLoopSSEMMX_Fetch: \n\t"
+ ".align 16 \n"
+ ""vLoopSSEMMX_Fetch": \n\t"
#ifdef PREFETCH
" prefetcht0 8(%%"REGSI", %%"REGA", 2) \n\t"
" prefetcht0 8(%%"REGD", %%"REGA", 2) \n"
#endif
- "vLoopSSEMMX: \n\t"
+ ""vLoopSSEMMX": \n\t"
" movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
" movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
@@ -661,15 +678,15 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
" lea 4(%%"REGA"), %%"REGA" \n\t"
" decl %%"REGC" \n\t"
- " jg vLoopSSEMMX_Fetch \n\t" /* if not on last one loop, prefetch */
- " jz vLoopSSEMMX \n\t" /* or just loop, or not */
+ " jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */
+ " jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */
#ifdef STREAMING_STORE
" sfence \n\t"
#endif
- " jmp MoreSpareChange \n" /* all done with vertical */
+ " jmp "MoreSpareChange"f \n" /* all done with vertical */
".align 16 \n"
- "vLoopMMX: \n\t"
+ ""vLoopMMX": \n\t"
" movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
" movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
@@ -700,22 +717,22 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
" movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
" lea 4(%%"REGA"), %%"REGA" \n\t"
- " loop vLoopMMX \n"
+ " loop "vLoopMMX"b \n"
/* Add a little code here to check if we have 2 more pixels to do and, if so, make one
* more pass thru vLoopMMX. We were processing in multiples of 4 pixels and alway have
* an even number so there will never be more than 2 left. trbarry 7/29/2002
*/
- "MoreSpareChange: \n\t"
+ ""MoreSpareChange": \n\t"
" cmpl "_EndOffset", %%"REGEA" \n\t" /* did we get them all */
- " jnl DoHorizontal \n\t" /* yes, else have 2 left */
- " movl $1, %%"REGC" \n\t" /* jigger loop ct */
- " sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
- " jmp vLoopMMX \n"
+ " jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */
+ " movl $1, %%"REGC" \n\t" /* jigger loop ct */
+ " sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
+ " jmp "vLoopMMX"b \n"
/* We've taken care of the vertical scaling, now do horizontal */
- "DoHorizontal: \n\t"
+ ""DoHorizontal": \n\t"
" movq "_YMask", %%mm7 \n\t" /* useful 0U0U.. mask constant */
" movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
@@ -727,7 +744,7 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
" mov "_vWorkUVW", %%"REGB" \n" /* chroma data, as UVUV UVUV... */
".align 16 \n"
- "hLoopMMX: \n\t"
+ ""hLoopMMX": \n\t"
/* x86_64: must use movl (accessing table of uint32's) */
" movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
@@ -757,7 +774,7 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
" lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytest */
" lea 4(%%"REGDI"), %%"REGDI" \n\t" /* bump to next output pixel addr */
- " loop hLoopMMX \n\t" /* loop for more */
+ " loop "hLoopMMX"b \n\t" /* loop for more */
"emms \n\t"
/* done with one line */
@@ -868,15 +885,15 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
* using SSE2 if we have proper alignment.
*/
"testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported? */
- "jz vMaybeSSEMMX_12 \n\t" /* n, can't do anyway */
+ "jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway */
#endif
"cmpl $2, %%"REGC" \n\t" /* we have at least 16 byts, 2 qwords? */
- "jl vMaybeSSEMMX_12 \n\t" /* n, don't bother */
+ "jl "vMaybeSSEMMX"f \n\t" /* n, don't bother */
"mov %%"REGSI", %%"REGB" \n\t"
"or %%"REGD", %%"REGB" \n\t"
"test $15, %%"REGB" \n\t" /* both src rows 16 byte aligned? */
- "jnz vMaybeSSEMMX_12 \n\t" /* n, don't use sse2 */
+ "jnz "vMaybeSSEMMX"f \n\t" /* n, don't use sse2 */
"shr $1, %%"REGC" \n\t" /* do 16 bytes at a time instead */
"dec %%"REGC" \n\t" /* jigger loop ct */
@@ -887,12 +904,12 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
"pxor %%xmm7, %%xmm7 \n"
".align 16 \n"
- "vLoopSSE2_Fetch_12: \n\t"
+ ""vLoopSSE2_Fetch": \n\t"
#ifdef PREFETCH
" prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
" prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
#endif
- "vLoopSSE2_12: \n\t"
+ ""vLoopSSE2": \n\t"
/* we're already checked pointers to be on dqword aligned */
" movdqa (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */
" movdqa (%%"REGD", %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */
@@ -927,8 +944,8 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" lea 16(%%"REGA"), %%"REGA" \n\t"
" decl %%"REGC" \n\t"
- " jg vLoopSSE2_Fetch_12 \n\t" /* if not on last one loop, prefetch */
- " jz vLoopSSE2_12 \n\t" /* or just loop, or not */
+ " jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */
+ " jz "vLoopSSE2"b \n\t" /* or just loop, or not */
/* done with our SSE2 fortified loop but we may need to pick up the spare change */
#ifdef STREAMING_STORE
@@ -941,13 +958,13 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
" shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
- " jz MoreSpareChange_12 \n" /* n, did them all */
+ " jz "MoreSpareChange"f \n" /* n, did them all */
/* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
* This first loop is not the performance bottleneck anyway but it is trivial to tune
* using SSE if we have proper alignment.
*/
- "vMaybeSSEMMX_12: \n\t"
+ ""vMaybeSSEMMX": \n\t"
" movq "_vWeight1", %%mm5 \n\t"
" movq "_vWeight2", %%mm6 \n\t"
@@ -955,17 +972,17 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" pxor %%mm7, %%mm7 \n\t"
#if !defined(__x86_64__)
" testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXTsupported? */
- " jz vLoopMMX_12 \n\t" /* n, can't do anyway */
+ " jz "vLoopMMX"f \n\t" /* n, can't do anyway */
#endif
" decl %%"REGC" \n" /* jigger loop ctr */
".align 16 \n"
- "vLoopSSEMMX_Fetch_12: \n\t"
+ ""vLoopSSEMMX_Fetch": \n\t"
#ifdef PREFETCH
" prefetcht0 8(%%"REGSI", %%"REGA") \n\t"
" prefetcht0 8(%%"REGD", %%"REGA") \n"
#endif
- "vLoopSSEMMX_12: \n\t"
+ ""vLoopSSEMMX": \n\t"
" movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
" movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
@@ -1001,15 +1018,15 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" lea 8(%%"REGA"), %%"REGA" \n\t"
" decl %%"REGC" \n\t"
- " jg vLoopSSEMMX_Fetch_12 \n\t" /* if not on last one loop, prefetch */
- " jz vLoopSSEMMX_12 \n\t" /* or just loop, or not */
+ " jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */
+ " jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */
#ifdef STREAMING_STORE
" sfence \n\t"
#endif
- " jmp MoreSpareChange_12 \n" /* all done with vertical */
+ " jmp "MoreSpareChange"f \n" /* all done with vertical */
".align 16 \n"
- "vLoopMMX_12: \n\t"
+ ""vLoopMMX": \n\t"
" movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
" movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
@@ -1040,23 +1057,23 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" movq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
" lea 8(%%"REGA"), %%"REGA" \n\t"
- " loop vLoopMMX_12 \n"
+ " loop "vLoopMMX"b \n"
/* Add a little code here to check if we have more pixels to do and, if so, make one
* more pass thru vLoopMMX. We were processing in multiples of 8 pixels and alway have
* an even number so there will never be more than 7 left.
*/
- "MoreSpareChange_12: \n\t"
+ ""MoreSpareChange": \n\t"
" cmpl "_src_row_size", %%"REGEA" \n\t" /* did we get them all */
- " jnl DoHorizontal_12 \n\t" /* yes, else have 2 left */
+ " jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */
" movl $1, %%"REGC" \n\t" /* jigger loop ct */
" movl "_src_row_size", %%"REGEA" \n\t"
" sub $8, %%"REGA" \n\t" /* back up to last 8 pixels */
- " jmp vLoopMMX_12 \n"
+ " jmp "vLoopMMX"b \n"
/* We've taken care of the vertical scaling, now do horizontal */
- "DoHorizontal_12: \n\t"
+ ""DoHorizontal": \n\t"
" pxor %%mm7, %%mm7 \n\t"
" movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
" mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
@@ -1066,14 +1083,14 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" mov "_dstp", %%"REGDI" \n\t" /* the destination line */
#if !defined(__x86_64__)
" testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
- " jz hLoopMMX_12 \n\t" /* n, can't do anyway */
+ " jz "hLoopMMX"f \n\t" /* n, can't do anyway */
#endif
/* With SSE support we will make 8 pixels (from 8 pairs) at a time */
" shrl $1, %%"REGC" \n\t" /* 8 bytes a time instead of 4 */
- " jz LessThan8 \n"
+ " jz "LessThan8"f \n"
".align 16 \n"
- "hLoopMMXSSE_12: \n\t"
+ ""hLoopMMXSSE": \n\t"
/* handle first 2 pixels */
@@ -1131,18 +1148,18 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" lea 96(%%"REGSI"), %%"REGSI" \n\t"
" lea 8(%%"REGDI"), %%"REGDI" \n\t"
" decl %%"REGC" \n\t"
- " jg hLoopMMXSSE_12 \n\t" /* loop for more */
+ " jg "hLoopMMXSSE"b \n\t" /* loop for more */
#ifdef STREAMING_STORE
" sfence \n"
#endif
- "LessThan8: \n\t"
+ ""LessThan8": \n\t"
" movl "_row_size", %%"REGC" \n\t"
" andl $7, %%"REGC" \n\t" /* we have done all but maybe this */
" shrl $2, %%"REGC" \n\t" /* now do only 4 bytes at a time */
- " jz LessThan4 \n"
+ " jz "LessThan4"f \n"
".align 16 \n"
- "hLoopMMX_12: \n\t"
+ ""hLoopMMX": \n\t"
/* handle first 2 pixels */
" movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
@@ -1171,14 +1188,14 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" lea 48(%%"REGSI"), %%"REGSI" \n\t"
" lea 4(%%"REGDI"), %%"REGDI" \n\t"
- " loop hLoopMMX_12 \n" /* loop for more */
+ " loop "hLoopMMX"b \n" /* loop for more */
/* test to see if we have a mod 4 size row, if not then more spare change */
- "LessThan4: \n\t"
+ ""LessThan4": \n\t"
" movl "_row_size", %%"REGC" \n\t"
" andl $3, %%"REGC" \n\t" /* remainder side mod 4 */
" cmpl $2, %%"REGC" \n\t"
- " jl LastOne \n\t" /* none, none */
+ " jl "LastOne"f \n\t" /* none, none */
/* handle 2 more pixels */
" movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
@@ -1199,9 +1216,9 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" lea 2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */
/* maybe one last pixel */
- "LastOne: \n\t"
+ ""LastOne": \n\t"
" cmpl $0, %%"REGC" \r\n" /* still more ? */
- " jz AllDone \r\n" /* n, done */
+ " jz "AllDone"f \r\n" /* n, done */
" movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
" movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
" punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
@@ -1212,8 +1229,8 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
" movd %%mm0, %%"REGEA" \n\t"
" movb %%al, (%%"REGDI") \n" /* store last one */
- "AllDone: \n\t"
- " emms \n\t"
+ ""AllDone": \n\t"
+ " emms \n\t"
#if !defined(__x86_64__)
"mov "_oldbx", %%"REGB" \n\t"
#endif
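
What the patch does, in short: the hand-written labels inside the asm blocks (vLoopSSE2, MoreSpareChange, hLoopMMX_12, ...) are replaced by GNU-as numeric local labels, defined once as string macros ("1" ... "14") and referenced at the jump sites with the "Nf"/"Nb" forward/backward suffixes. Named labels are global to the generated assembly file, so they clash if the compiler ever emits an asm block more than once (inlining, function cloning), and they forced the yv12 routine to carry its own "_12" copies; numeric local labels may be repeated freely, which is why do_warp_yuy2() and do_warp_yv12() can now share one label table.

A minimal sketch of the same technique, assuming GCC/clang extended asm on x86/x86_64 (the count_down() helper and the LOOP macro are illustrative only, not part of the patch):

    #include <stdio.h>

    #define LOOP "1"                        /* label macro, same style as the patch */

    static int count_down(int n)
    {
        __asm__ __volatile__ (
            ""LOOP":            \n\t"       /* expands to "1:", a local label */
            "  decl  %0         \n\t"       /* n-- */
            "  jnz   "LOOP"b    \n\t"       /* "1b" = nearest "1:" looking backwards */
            : "+r" (n));
        return n;                           /* always 0 once the loop falls through */
    }

    int main(void)
    {
        printf("%d\n", count_down(5));      /* prints 0 */
        return 0;
    }

Because "1:" is a local label, the asm body can be emitted any number of times, or copied into a second function, without the assembler reporting a duplicate symbol; the converted jz/jg/jmp sites above rely on exactly this with their "f" and "b" suffixes.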