author     phintuka <phintuka>    2008-02-21 02:30:53 +0000
committer  phintuka <phintuka>    2008-02-21 02:30:53 +0000
commit     dd51de2ed44ab3281934873bac940513d7df6d30 (patch)
tree       49cf8ddf23ff38cd3e3e09dfa2bd0bdeb1eddbc2
parent     7475353b8f2ae2d7171ee80e8634b5ce102808a8 (diff)
download   xineliboutput-dd51de2ed44ab3281934873bac940513d7df6d30.tar.gz
           xineliboutput-dd51de2ed44ab3281934873bac940513d7df6d30.tar.bz2
Use local labels in assembler code.
This should fix assembler errors when function(s) are inlined.
-rw-r--r--  xine_post_swscale.c  141
1 file changed, 79 insertions, 62 deletions
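Background on the change: GCC pastes an asm template verbatim into every inlined copy of the containing function, so a named label such as "vLoopMMX:" ends up defined once per copy and the assembler stops with a duplicate-symbol error. GNU as local labels (1:, 2:, ...) may be defined any number of times; a reference "1f" binds to the nearest "1:" forward, "1b" to the nearest one backward. Below is a minimal sketch of the pattern this patch adopts — the helper and its names are hypothetical, not code from xine_post_swscale.c:

  #include <stdint.h>

  /* Keep a readable name but expand to a numeric local label via string
   * pasting, just like the patch's "#define vLoopMMX "10"" block. */
  #define byteLoop "1"

  /* Hypothetical helper: zero n bytes (n > 0). Safe to inline into any
   * number of call sites, because a local label may be redefined. */
  static inline void clear_bytes(uint8_t *p, long n)
  {
    __asm__ __volatile__ (
      ""byteLoop":       \n\t"   /* pastes to "1:" */
      " movb $0, (%0)    \n\t"   /* *p = 0 */
      " inc %0           \n\t"
      " dec %1           \n\t"
      " jnz "byteLoop"b  \n\t"   /* pastes to "jnz 1b" */
      : "+r" (p), "+r" (n)
      :
      : "memory");
  }

  /* With "byteLoop: ... jnz byteLoop" spelled as a named label instead,
   * inlining clear_bytes at two call sites makes the assembler fail with
   * "symbol `byteLoop' is already defined" — the error this commit fixes. */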
diff --git a/xine_post_swscale.c b/xine_post_swscale.c
index bea7ce0a..2d2470c0 100644
--- a/xine_post_swscale.c
+++ b/xine_post_swscale.c
@@ -17,7 +17,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
  *
- * $Id: xine_post_swscale.c,v 1.1 2008-02-20 22:31:23 phintuka Exp $
+ * $Id: xine_post_swscale.c,v 1.2 2008-02-21 02:30:53 phintuka Exp $
  *
  * Simple (faster) resize for avisynth
  * Copyright (C) 2002 Tom Barry
@@ -443,6 +443,23 @@ static void init_tables_yuy2(int newwidth, int newheight, int oldwidth, int oldh
 #define _SSE2enabledW "%16"
 #endif
 
+/* Labels */
+#define vMaybeSSEMMX "1"
+#define LessThan8 "2"
+#define LessThan4 "3"
+#define AllDone "4"
+#define LastOne "5"
+#define vLoopSSE2_Fetch "6"
+#define vLoopSSE2 "7"
+#define vLoopSSEMMX_Fetch "8"
+#define vLoopSSEMMX "9"
+#define vLoopMMX "10"
+#define MoreSpareChange "11"
+#define DoHorizontal "12"
+#define hLoopMMX "13"
+#define hLoopMMXSSE "14"
+
+
 /* structure for mmx constants */
 typedef union {
   uint64_t uq[1]; /* Unsigned Quadword */
@@ -530,10 +547,10 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
  * using SSE2 if we have proper alignment.
  */
 "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported?*/
-"jz vMaybeSSEMMX \n\t" /* n, can't do anyway*/
+"jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway*/
 #endif
 "cmpl $2, %%"REGC" \n\t" /* we have at least 16 bytes, 2 qwords? */
-"jl vMaybeSSEMMX \n\t" /* n, don't bother*/
+"jl "vMaybeSSEMMX"f \n\t" /* n, don't bother*/
 
 "shrl $1, %%"REGC" \n\t" /* do 16 bytes at a time instead*/
 "decl %%"REGC" \n" /* jigger loop ct */
@@ -545,12 +562,12 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
 "movdqa "_vWeight2", %%xmm6 \n\t"
 "movdqa "_YMask", %%xmm7 \n"
 
-"vLoopSSE2_Fetch: \n\t"
+""vLoopSSE2_Fetch": \n\t"
 #ifdef PREFETCH
 " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
 " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
 #endif
-"vLoopSSE2: \n\t"
+""vLoopSSE2": \n\t"
 
 " movdqu (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
 " movdqu (%%"REGD", %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */
@@ -587,8 +604,8 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
 " lea 8(%%"REGA"), %%"REGA" \n\t"
 " decl %%"REGC" \n\t"
-" jg vLoopSSE2_Fetch \n\t" /* if not on last one loop, prefetch */
-" jz vLoopSSE2 \n\t" /* or just loop, or not */
+" jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */
+" jz "vLoopSSE2"b \n\t" /* or just loop, or not */
 
 /* done with our SSE2 fortified loop but we may need to pick up the spare change */
 #ifdef STREAMING_STORE
@@ -602,14 +619,14 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
 " movq "_vWeight2", %%mm6 \n\t"
 " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
 
-" shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
-" jz MoreSpareChange \n" /* n, did them all */
+" shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
+" jz "MoreSpareChange"f \n" /* n, did them all */
 
 /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
  * This first loop is not the performance bottleneck anyway but it is trivial to tune
  * using SSE if we have proper alignment.
  */
-"vMaybeSSEMMX: \n\t"
+""vMaybeSSEMMX": \n\t"
 
 " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
 " movq "_vWeight1", %%mm5 \n\t"
@@ -617,17 +634,17 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
 " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
 #if !defined(__x86_64__)
 " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
-" jz vLoopMMX \n\t" /* n, can't do anyway */
+" jz "vLoopMMX"f \n\t" /* n, can't do anyway */
 #endif
 " decl %%"REGC" \n" /* jigger loop ctr */
 
-".align 16 \n"
-"vLoopSSEMMX_Fetch: \n\t"
+".align 16 \n"
+""vLoopSSEMMX_Fetch": \n\t"
 #ifdef PREFETCH
 " prefetcht0 8(%%"REGSI", %%"REGA", 2) \n\t"
 " prefetcht0 8(%%"REGD", %%"REGA", 2) \n"
 #endif
-"vLoopSSEMMX: \n\t"
+""vLoopSSEMMX": \n\t"
 
 " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
 " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
@@ -661,15 +678,15 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
 " lea 4(%%"REGA"), %%"REGA" \n\t"
 " decl %%"REGC" \n\t"
-" jg vLoopSSEMMX_Fetch \n\t" /* if not on last one loop, prefetch */
-" jz vLoopSSEMMX \n\t" /* or just loop, or not */
+" jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */
+" jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */
 #ifdef STREAMING_STORE
 " sfence \n\t"
 #endif
-" jmp MoreSpareChange \n" /* all done with vertical */
+" jmp "MoreSpareChange"f \n" /* all done with vertical */
 
 ".align 16 \n"
-"vLoopMMX: \n\t"
+""vLoopMMX": \n\t"
 
 " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
 " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */
@@ -700,22 +717,22 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
 " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
 
 " lea 4(%%"REGA"), %%"REGA" \n\t"
-" loop vLoopMMX \n"
+" loop "vLoopMMX"b \n"
 
 /* Add a little code here to check if we have 2 more pixels to do and, if so, make one
  * more pass thru vLoopMMX. We were processing in multiples of 4 pixels and alway have
  * an even number so there will never be more than 2 left. trbarry 7/29/2002
  */
-"MoreSpareChange: \n\t"
+""MoreSpareChange": \n\t"
 
 " cmpl "_EndOffset", %%"REGEA" \n\t" /* did we get them all */
-" jnl DoHorizontal \n\t" /* yes, else have 2 left */
-" movl $1, %%"REGC" \n\t" /* jigger loop ct */
-" sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
-" jmp vLoopMMX \n"
+" jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */
+" movl $1, %%"REGC" \n\t" /* jigger loop ct */
+" sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
+" jmp "vLoopMMX"b \n"
 
 /* We've taken care of the vertical scaling, now do horizontal */
-"DoHorizontal: \n\t"
+""DoHorizontal": \n\t"
 
 " movq "_YMask", %%mm7 \n\t" /* useful 0U0U.. mask constant */
 " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
@@ -727,7 +744,7 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
 " mov "_vWorkUVW", %%"REGB" \n" /* chroma data, as UVUV UVUV... */
 
 ".align 16 \n"
-"hLoopMMX: \n\t"
+""hLoopMMX": \n\t"
 
 /* x86_64: must use movl (accessing table of uint32's) */
 " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
@@ -757,7 +774,7 @@ static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
 " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytest */
 " lea 4(%%"REGDI"), %%"REGDI" \n\t" /* bump to next output pixel addr */
-" loop hLoopMMX \n\t" /* loop for more */
+" loop "hLoopMMX"b \n\t" /* loop for more */
 
 "emms \n\t" /* done with one line */
@@ -868,15 +885,15 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
  * using SSE2 if we have proper alignment.
  */
 "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported? */
-"jz vMaybeSSEMMX_12 \n\t" /* n, can't do anyway */
+"jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway */
 #endif
 "cmpl $2, %%"REGC" \n\t" /* we have at least 16 byts, 2 qwords? */
-"jl vMaybeSSEMMX_12 \n\t" /* n, don't bother */
+"jl "vMaybeSSEMMX"f \n\t" /* n, don't bother */
 
 "mov %%"REGSI", %%"REGB" \n\t"
 "or %%"REGD", %%"REGB" \n\t"
 "test $15, %%"REGB" \n\t" /* both src rows 16 byte aligned? */
-"jnz vMaybeSSEMMX_12 \n\t" /* n, don't use sse2 */
+"jnz "vMaybeSSEMMX"f \n\t" /* n, don't use sse2 */
 
 "shr $1, %%"REGC" \n\t" /* do 16 bytes at a time instead */
 "dec %%"REGC" \n\t" /* jigger loop ct */
@@ -887,12 +904,12 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 "pxor %%xmm7, %%xmm7 \n"
 
 ".align 16 \n"
-"vLoopSSE2_Fetch_12: \n\t"
+""vLoopSSE2_Fetch": \n\t"
 #ifdef PREFETCH
 " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
 " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
 #endif
-"vLoopSSE2_12: \n\t"
+""vLoopSSE2": \n\t"
 
 /* we're already checked pointers to be on dqword aligned */
 " movdqa (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */
 " movdqa (%%"REGD", %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */
@@ -927,8 +944,8 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " lea 16(%%"REGA"), %%"REGA" \n\t"
 " decl %%"REGC" \n\t"
-" jg vLoopSSE2_Fetch_12 \n\t" /* if not on last one loop, prefetch */
-" jz vLoopSSE2_12 \n\t" /* or just loop, or not */
+" jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */
+" jz "vLoopSSE2"b \n\t" /* or just loop, or not */
 
 /* done with our SSE2 fortified loop but we may need to pick up the spare change */
 #ifdef STREAMING_STORE
@@ -941,13 +958,13 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
 
 " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
-" jz MoreSpareChange_12 \n" /* n, did them all */
+" jz "MoreSpareChange"f \n" /* n, did them all */
 
 /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
  * This first loop is not the performance bottleneck anyway but it is trivial to tune
  * using SSE if we have proper alignment.
  */
-"vMaybeSSEMMX_12: \n\t"
+""vMaybeSSEMMX": \n\t"
 
 " movq "_vWeight1", %%mm5 \n\t"
 " movq "_vWeight2", %%mm6 \n\t"
@@ -955,17 +972,17 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " pxor %%mm7, %%mm7 \n\t"
 #if !defined(__x86_64__)
 " testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXTsupported? */
-" jz vLoopMMX_12 \n\t" /* n, can't do anyway */
+" jz "vLoopMMX"f \n\t" /* n, can't do anyway */
 #endif
 " decl %%"REGC" \n" /* jigger loop ctr */
 
 ".align 16 \n"
-"vLoopSSEMMX_Fetch_12: \n\t"
+""vLoopSSEMMX_Fetch": \n\t"
 #ifdef PREFETCH
 " prefetcht0 8(%%"REGSI", %%"REGA") \n\t"
 " prefetcht0 8(%%"REGD", %%"REGA") \n"
 #endif
-"vLoopSSEMMX_12: \n\t"
+""vLoopSSEMMX": \n\t"
 
 " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
 " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
@@ -1001,15 +1018,15 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " lea 8(%%"REGA"), %%"REGA" \n\t"
 " decl %%"REGC" \n\t"
-" jg vLoopSSEMMX_Fetch_12 \n\t" /* if not on last one loop, prefetch */
-" jz vLoopSSEMMX_12 \n\t" /* or just loop, or not */
+" jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */
+" jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */
 #ifdef STREAMING_STORE
 " sfence \n\t"
 #endif
-" jmp MoreSpareChange_12 \n" /* all done with vertical */
+" jmp "MoreSpareChange"f \n" /* all done with vertical */
 
 ".align 16 \n"
-"vLoopMMX_12: \n\t"
+""vLoopMMX": \n\t"
 
 " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */
 " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */
@@ -1040,23 +1057,23 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " movq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
 
 " lea 8(%%"REGA"), %%"REGA" \n\t"
-" loop vLoopMMX_12 \n"
+" loop "vLoopMMX"b \n"
 
 /* Add a little code here to check if we have more pixels to do and, if so, make one
  * more pass thru vLoopMMX. We were processing in multiples of 8 pixels and alway have
  * an even number so there will never be more than 7 left.
  */
-"MoreSpareChange_12: \n\t"
+""MoreSpareChange": \n\t"
 
 " cmpl "_src_row_size", %%"REGEA" \n\t" /* did we get them all */
-" jnl DoHorizontal_12 \n\t" /* yes, else have 2 left */
+" jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */
 " movl $1, %%"REGC" \n\t" /* jigger loop ct */
 " movl "_src_row_size", %%"REGEA" \n\t"
 " sub $8, %%"REGA" \n\t" /* back up to last 8 pixels */
-" jmp vLoopMMX_12 \n"
+" jmp "vLoopMMX"b \n"
 
 /* We've taken care of the vertical scaling, now do horizontal */
-"DoHorizontal_12: \n\t"
+""DoHorizontal": \n\t"
 
 " pxor %%mm7, %%mm7 \n\t"
 " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
 " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
@@ -1066,14 +1083,14 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " mov "_dstp", %%"REGDI" \n\t" /* the destination line */
 #if !defined(__x86_64__)
 " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
-" jz hLoopMMX_12 \n\t" /* n, can't do anyway */
+" jz "hLoopMMX"f \n\t" /* n, can't do anyway */
 #endif
 
 /* With SSE support we will make 8 pixels (from 8 pairs) at a time */
 " shrl $1, %%"REGC" \n\t" /* 8 bytes a time instead of 4 */
-" jz LessThan8 \n"
+" jz "LessThan8"f \n"
 
 ".align 16 \n"
-"hLoopMMXSSE_12: \n\t"
+""hLoopMMXSSE": \n\t"
 
 /* handle first 2 pixels */
@@ -1131,18 +1148,18 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " lea 96(%%"REGSI"), %%"REGSI" \n\t"
 " lea 8(%%"REGDI"), %%"REGDI" \n\t"
 " decl %%"REGC" \n\t"
-" jg hLoopMMXSSE_12 \n\t" /* loop for more */
+" jg "hLoopMMXSSE"b \n\t" /* loop for more */
 #ifdef STREAMING_STORE
 " sfence \n"
 #endif
-"LessThan8: \n\t"
+""LessThan8": \n\t"
 " movl "_row_size", %%"REGC" \n\t"
 " andl $7, %%"REGC" \n\t" /* we have done all but maybe this */
 " shrl $2, %%"REGC" \n\t" /* now do only 4 bytes at a time */
-" jz LessThan4 \n"
+" jz "LessThan4"f \n"
 
 ".align 16 \n"
-"hLoopMMX_12: \n\t"
+""hLoopMMX": \n\t"
 
 /* handle first 2 pixels */
 " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
@@ -1171,14 +1188,14 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " lea 48(%%"REGSI"), %%"REGSI" \n\t"
 " lea 4(%%"REGDI"), %%"REGDI" \n\t"
-" loop hLoopMMX_12 \n" /* loop for more */
+" loop "hLoopMMX"b \n" /* loop for more */
 
 /* test to see if we have a mod 4 size row, if not then more spare change */
-"LessThan4: \n\t"
+""LessThan4": \n\t"
 " movl "_row_size", %%"REGC" \n\t"
 " andl $3, %%"REGC" \n\t" /* remainder side mod 4 */
 " cmpl $2, %%"REGC" \n\t"
-" jl LastOne \n\t" /* none, none */
+" jl "LastOne"f \n\t" /* none, none */
 
 /* handle 2 more pixels */
 " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
@@ -1199,9 +1216,9 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " lea 2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */
 
 /* maybe one last pixel */
-"LastOne: \n\t"
+""LastOne": \n\t"
 " cmpl $0, %%"REGC" \r\n" /* still more ? */
-" jz AllDone \r\n" /* n, done */
+" jz "AllDone"f \r\n" /* n, done */
 " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
 " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */
 " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */
@@ -1212,8 +1229,8 @@ static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
 " movd %%mm0, %%"REGEA" \n\t"
 " movb %%al, (%%"REGDI") \n" /* store last one */
 
-"AllDone: \n\t"
-" emms \n\t"
+""AllDone": \n\t"
+" emms \n\t"
 #if !defined(__x86_64__)
 "mov "_oldbx", %%"REGB" \n\t"
 #endif