diff options
Diffstat (limited to 'src/post/deinterlace/plugins/greedyh.asm')
-rw-r--r-- | src/post/deinterlace/plugins/greedyh.asm | 123 |
1 files changed, 69 insertions, 54 deletions
diff --git a/src/post/deinterlace/plugins/greedyh.asm b/src/post/deinterlace/plugins/greedyh.asm index d2b3f3566..1b7561b09 100644 --- a/src/post/deinterlace/plugins/greedyh.asm +++ b/src/post/deinterlace/plugins/greedyh.asm @@ -17,6 +17,21 @@ ///////////////////////////////////////////////////////////////////////////// #include "x86-64_macros.inc" +#include <mangle.h> + +#if !defined(MASKS_DEFINED) +#define MASKS_DEFINED +static const int64_t __attribute__((__used__)) YMask = 0x00ff00ff00ff00ffull; // to keep only luma +static const int64_t __attribute__((__used__)) UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma +static const int64_t __attribute__((__used__)) ShiftMask = 0xfefffefffefffeffull; // to avoid shifting chroma to luma +static const int64_t __attribute__((__used__)) QW256 = 0x0100010001000100ull; // 4 256's + +static int64_t MaxComb; +static int64_t MotionThreshold; +static int64_t MotionSense; +static int64_t QW256B; + +#endif static void FUNCT_NAME(uint8_t *output, int outstride, deinterlace_frame_data_t *data, @@ -26,16 +41,6 @@ static void FUNCT_NAME(uint8_t *output, int outstride, int stride = (width*2); int InfoIsOdd = bottom_field; - // in tight loop some vars are accessed faster in local storage - int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma - int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma - int64_t ShiftMask = 0xfefffefffefffeffull; // to avoid shifting chroma to luma - int64_t QW256 = 0x0100010001000100ull; // 4 256's - - int64_t MaxComb; - int64_t MotionThreshold; - int64_t MotionSense; - int Line; long LoopCtr; long oldbx; @@ -49,7 +54,6 @@ static void FUNCT_NAME(uint8_t *output, int outstride, unsigned char* L2P; // ptr to prev Line2 unsigned char* Dest = output; - int64_t QW256B; int64_t LastAvg=0; //interp value from left qword // Set up our two parms that are actually evaluated for each pixel @@ -61,7 +65,6 @@ static void FUNCT_NAME(uint8_t *output, int outstride, i = GreedyMotionSense; // scale to 
range of 0-257 MotionSense = i << 48 | i << 32 | i << 16 | i; - i = 0xffffffff - 256; QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. @@ -105,20 +108,40 @@ static void FUNCT_NAME(uint8_t *output, int outstride, for (Line = 0; Line < (FieldHeight - 1); ++Line) { LoopCtr = stride / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop +/* Hans-Dieter Kosch writes: + * + * > The older compilers do not understand the syntax + * > __asm__ ( "command %[name0]" : : [name0] "x"(arg0) ) + * > They only understand + * > __asm__ ( "command %0" : : "x"(arg0) ) + * + * now we define the arguments to make the asm code less ugly. + */ +#ifndef asmLastAvg +#define asmLastAvg "%0" +#define asmL1 "%1" +#define asmL3 "%2" +#define asmL2P "%3" +#define asmL2 "%4" +#define asmDest "%5" +#define asmLoopCtr "%6" +#define asmoldbx "%7" +#endif + // For ease of reading, the comments below assume that we're operating on an odd // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines.. 
__asm__ __volatile__ ( // save ebx (-fPIC) - MOVX" %%"XBX", %[oldbx]\n\t" + MOVX" %%"XBX", "asmoldbx"\n\t" - MOVX" %[L1], %%"XAX"\n\t" + MOVX" "asmL1", %%"XAX"\n\t" LEAX" 8(%%"XAX"), %%"XBX"\n\t" // next qword needed by DJR - MOVX" %[L3], %%"XCX"\n\t" + MOVX" "asmL3", %%"XCX"\n\t" SUBX" %%"XAX", %%"XCX"\n\t" // carry L3 addr as an offset - MOVX" %[L2P], %%"XDX"\n\t" - MOVX" %[L2], %%"XSI"\n\t" - MOVX" %[Dest], %%"XDI"\n\t" // DL1 if Odd or DL2 if Even + MOVX" "asmL2P", %%"XDX"\n\t" + MOVX" "asmL2", %%"XSI"\n\t" + MOVX" "asmDest", %%"XDI"\n\t" // DL1 if Odd or DL2 if Even ".align 8\n\t" "1:\n\t" @@ -129,15 +152,15 @@ static void FUNCT_NAME(uint8_t *output, int outstride, "movq (%%"XAX", %%"XCX"), %%mm3\n\t" // L3, next odd row "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp // pavgb mm6, mm3 // use macro below - V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") + V_PAVGB ("%%mm6", "%%mm3", "%%mm4", MANGLE(ShiftMask)) // DJR - Diagonal Jaggie Reduction // In the event that we are going to use an average (Bob) pixel we do not want a jagged // stair step effect. To combat this we avg in the 2 horizontally adjacent pixels into the // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels. 
- "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row - "movq %%mm6, %[LastAvg]\n\t" // save for next pass + "movq "asmLastAvg", %%mm4\n\t" // the bob value from prev qword in row + "movq %%mm6, "asmLastAvg"\n\t" // save for next pass "psrlq $48, %%mm4\n\t" // right justify 1 pixel "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel "psllq $16, %%mm7\n\t" // left justify 3 pixels @@ -145,22 +168,22 @@ static void FUNCT_NAME(uint8_t *output, int outstride, "movq (%%"XBX"), %%mm5\n\t" // next horiz qword from L1 // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below - V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", "%[ShiftMask]") + V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", MANGLE(ShiftMask)) "psllq $48, %%mm5\n\t" // left just 1 pixel "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel "psrlq $16, %%mm7\n\t" // right just 3 pixels "por %%mm7, %%mm5\n\t" // combine // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro - V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX + V_PAVGB ("%%mm4", "%%mm5", "%%mm5", MANGLE(ShiftMask)) // mm5 gets modified if MMX // pavgb mm6, mm4 // avg of center and surround interp vals, use macro - V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") + V_PAVGB ("%%mm6", "%%mm4", "%%mm7", MANGLE(ShiftMask)) // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. 
#ifndef IS_MMX // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent - V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") + V_PAVGB ("%%mm4", "%%mm6", "%%mm7", MANGLE(ShiftMask)) // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent - V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") + V_PAVGB ("%%mm6", "%%mm4", "%%mm7", MANGLE(ShiftMask)) #endif // get abs value of possible L2 comb @@ -213,34 +236,34 @@ static void FUNCT_NAME(uint8_t *output, int outstride, // pminub mm5, mm3 // now = Min(L1,L3), use macro V_PMINUB ("%%mm5", "%%mm3", "%%mm7") // allow the value to be above the high or below the low by amt of MaxComb - "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff - "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff + "psubusb "MANGLE(MaxComb)", %%mm5\n\t" // lower min by diff + "paddusb "MANGLE(MaxComb)", %%mm2\n\t" // increase max by diff // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro V_PMAXUB ("%%mm4", "%%mm5") // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped V_PMINUB ("%%mm4", "%%mm2", "%%mm7") // Blend weave pixel with bob pixel, depending on motion val in mm0 - "psubusb %[MotionThreshold], %%mm0\n\t"// test Threshold, clear chroma change >>>?? - "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits - "movq %[QW256], %%mm7\n\t" + "psubusb "MANGLE(MotionThreshold)", %%mm0\n\t"// test Threshold, clear chroma change >>>?? + "pmullw "MANGLE(MotionSense)", %%mm0\n\t" // mul by user factor, keep low 16 bits + "movq "MANGLE(QW256)", %%mm7\n\t" #ifdef IS_SSE "pminsw %%mm7, %%mm0\n\t" // max = 256 #else - "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. - "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) + "paddusw "MANGLE(QW256B)", %%mm0\n\t" // add, may sat at fff.. 
+ "psubusw "MANGLE(QW256B)", %%mm0\n\t" // now = Min(L1,256) #endif "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing - "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value + "pand "MANGLE(YMask)", %%mm4\n\t" // keep only luma from calc'd value "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion - "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value + "pand "MANGLE(YMask)", %%mm6\n\t" // keep only luma from calc'd value "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion "paddusw %%mm6, %%mm4\n\t" // combine "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg // chroma comes from weave pixel - "pand %[UVMask], %%mm2\n\t" // keep chroma + "pand "MANGLE(UVMask)", %%mm2\n\t" // keep chroma "por %%mm4, %%mm2\n\t" // and combine V_MOVNTQ ("(%%"XDI")", "%%mm2") // move in our clipped best, use macro @@ -251,7 +274,7 @@ static void FUNCT_NAME(uint8_t *output, int outstride, LEAX" 8(%%"XDX"), %%"XDX"\n\t" LEAX" 8(%%"XDI"), %%"XDI"\n\t" LEAX" 8(%%"XSI"), %%"XSI"\n\t" - DECX" %[LoopCtr]\n\t" + DECX" "asmLoopCtr"\n\t" "jg 1b\n\t" // loop if not to last line // note P-III default assumes backward branches taken "jl 1f\n\t" // done @@ -259,26 +282,18 @@ static void FUNCT_NAME(uint8_t *output, int outstride, "jmp 1b\n\t" "1:\n\t" - MOVX" %[oldbx], %%"XBX"\n\t" + MOVX" "asmoldbx", %%"XBX"\n\t" : /* no outputs */ - : [LastAvg] "m"(LastAvg), - [L1] "m"(L1), - [L3] "m"(L3), - [L2P] "m"(L2P), - [L2] "m"(L2), - [Dest] "m"(Dest), - [ShiftMask] "m"(ShiftMask), - [MaxComb] "m"(MaxComb), - [MotionThreshold] "m"(MotionThreshold), - [MotionSense] "m"(MotionSense), - [QW256B] "m"(QW256B), - [YMask] "m"(YMask), - [UVMask] "m"(UVMask), - [LoopCtr] "m"(LoopCtr), - [QW256] "m"(QW256), - [oldbx] "m"(oldbx) + : "m"(LastAvg), + "m"(L1), + "m"(L3), + "m"(L2P), + "m"(L2), + "m"(Dest), + "m"(LoopCtr), + "m"(oldbx) : XAX, XCX, XDX, XSI, XDI, #ifdef ARCH_X86 |