/***************************************************************************** ** $Id: greedy2frame_template.c,v 1.10 2006/12/21 09:54:45 dgp85 Exp $ ****************************************************************************** ** Copyright (c) 2000 John Adcock, Tom Barry, Steve Grimm All rights reserved. ** port copyright (c) 2003 Miguel Freitas ****************************************************************************** ** ** This file is subject to the terms of the GNU General Public License as ** published by the Free Software Foundation. A copy of this license is ** included with this software distribution in the file COPYING. If you ** do not have a copy, you may obtain a copy by writing to the Free ** Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. ** ** This software is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details ****************************************************************************** ** CVS Log ** ** $Log: greedy2frame_template.c,v $ ** Revision 1.10 2006/12/21 09:54:45 dgp85 ** Apply the textrel patch from Gentoo, thanks to PaX team for providing it. The patch was applied and tested for a while in Gentoo and Pardus, and solves also Debian's problems with non-PIC code. If problems will arise, they'll be debugged. ** ** Revision 1.9 2006/02/04 14:06:29 miguelfreitas ** Enable AMD64 mmx/sse support in some plugins (tvtime, libmpeg2, goom...) ** patch by dani3l ** ** Revision 1.8 2005/06/05 16:00:06 miguelfreitas ** quite some hacks for gcc 2.95 compatibility ** ** Revision 1.7 2004/04/09 02:57:06 miguelfreitas ** tvtime deinterlacing algorithms assumed top_field_first=1 ** top_field_first=0 (aka bottom_field_first) should now work as expected ** ** Revision 1.6 2004/02/12 20:53:31 mroi ** my gcc (partly 3.4 already) optimizes these away, because they are only used ** inside inline assembler (which the compiler does not recognize); so actually ** the code is wrong (the asm parts should list these as inputs), but telling ** the compiler to keep them is the easier fix ** ** Revision 1.5 2004/01/05 12:15:55 siggi ** wonder why Mike isn't complaining about C++ style comments, any more... ** ** Revision 1.4 2004/01/05 01:47:26 tmmm ** DOS/Win CRs are forbidden, verboten, interdit ** ** Revision 1.3 2004/01/02 20:53:43 miguelfreitas ** better MANGLE from ffmpeg ** ** Revision 1.2 2004/01/02 20:47:03 miguelfreitas ** my small contribution to the cygwin port ;-) ** ** Revision 1.1 2003/06/22 17:30:03 miguelfreitas ** use our own port of greedy2frame (tvtime port is currently broken) ** ** Revision 1.8 2001/11/23 17:18:54 adcockj ** Fixed silly and/or confusion ** ** Revision 1.7 2001/11/22 22:27:00 adcockj ** Bug Fixes ** ** Revision 1.6 2001/11/21 15:21:40 adcockj ** Renamed DEINTERLACE_INFO to TDeinterlaceInfo in line with standards ** Changed TDeinterlaceInfo structure to have history of pictures. ** ** Revision 1.5 2001/07/31 06:48:33 adcockj ** Fixed index bug spotted by Peter Gubanov ** ** Revision 1.4 2001/07/13 16:13:33 adcockj ** Added CVS tags and removed tabs ** *****************************************************************************/ /* * This is the implementation of the Greedy 2-frame deinterlace algorithm * described in DI_Greedy2Frame.c. It's in a separate file so we can compile * variants for different CPU types; most of the code is the same in the * different variants. */ /**************************************************************************** ** Field 1 | Field 2 | Field 3 | Field 4 | ** T0 | | T1 | | ** | M0 | | M1 | ** B0 | | B1 | | */ /* debugging feature */ /* output the value of mm4 at this point which is pink where we will weave */ /* and green were we are going to bob */ /* uncomment next line to see this */ /* #define CHECK_BOBWEAVE */ #if !defined(MASKS_DEFINED) #define MASKS_DEFINED static const int64_t __attribute__((__used__)) YMask = 0x00ff00ff00ff00ffll; static const int64_t __attribute__((__used__)) Mask = 0x7f7f7f7f7f7f7f7fll; static const int64_t __attribute__((__used__)) DwordOne = 0x0000000100000001ll; static const int64_t __attribute__((__used__)) DwordTwo = 0x0000000200000002ll; static int64_t qwGreedyTwoFrameThreshold; #endif #include #if defined(IS_SSE) static void DeinterlaceGreedy2Frame_SSE(uint8_t *output, int outstride, deinterlace_frame_data_t *data, int bottom_field, int second_field, int width, int height ) #elif defined(IS_3DNOW) static void DeinterlaceGreedy2Frame_3DNOW(uint8_t *output, int outstride, deinterlace_frame_data_t *data, int bottom_field, int second_field, int width, int height ) #else static void DeinterlaceGreedy2Frame_MMX(uint8_t *output, int outstride, deinterlace_frame_data_t *data, int bottom_field, int second_field, int width, int height ) #endif { #if defined(ARCH_X86) || defined(ARCH_X86_64) int Line; int stride = width * 2; register uint8_t* M1; register uint8_t* M0; register uint8_t* T0; register uint8_t* T1; register uint8_t* B1; register uint8_t* B0; uint8_t* Dest = output; register uint8_t* Dest2; register int count; uint32_t Pitch = stride*2; uint32_t LineLength = stride; uint32_t PitchRest = Pitch - (LineLength >> 3)*8; qwGreedyTwoFrameThreshold = GreedyTwoFrameThreshold; qwGreedyTwoFrameThreshold += (GreedyTwoFrameThreshold2 << 8); qwGreedyTwoFrameThreshold += (qwGreedyTwoFrameThreshold << 48) + (qwGreedyTwoFrameThreshold << 32) + (qwGreedyTwoFrameThreshold << 16); if( second_field ) { M1 = data->f0; T1 = data->f0; M0 = data->f1; T0 = data->f1; } else { M1 = data->f0; T1 = data->f1; M0 = data->f1; T0 = data->f2; } if( bottom_field ) { M1 += stride; T1 += 0; B1 = T1 + Pitch; M0 += stride; T0 += 0; B0 = T0 + Pitch; } else { M1 += Pitch; T1 += stride; B1 = T1 + Pitch; M0 += Pitch; T0 += stride; B0 = T0 + Pitch; xine_fast_memcpy(Dest, M1, LineLength); Dest += outstride; } for (Line = 0; Line < (height / 2) - 1; ++Line) { /* Always use the most recent data verbatim. By definition it's correct * (it'd be shown on an interlaced display) and our job is to fill in * the spaces between the new lines. */ xine_fast_memcpy(Dest, T1, stride); Dest += outstride; Dest2 = Dest; count = LineLength >> 3; do { asm volatile( /* Figure out what to do with the scanline above the one we just copied. * See above for a description of the algorithm. */ ".align 8 \n\t" "movq %4, %%mm6 \n\t" "movq %0, %%mm1 \n\t" // T1 "movq %1, %%mm0 \n\t" // M1 "movq %2, %%mm3 \n\t" // B1 "movq %3, %%mm2 \n\t" // M0 : /* no output */ : "m" (*T1), "m" (*M1), "m" (*B1), "m" (*M0), "m" (Mask) ); asm volatile( /* Figure out what to do with the scanline above the one we just copied. * See above for a description of the algorithm. * Average T1 and B1 so we can do interpolated bobbing if we bob onto T1 */ "movq %%mm3, %%mm7 \n\t" /* mm7 = B1 */ #if defined(IS_SSE) "pavgb %%mm1, %%mm7 \n\t" #elif defined(IS_3DNOW) "pavgusb %%mm1, %%mm7 \n\t" #else "movq %%mm1, %%mm5 \n\t" /* mm5 = T1 */ "psrlw $1, %%mm7 \n\t" /* mm7 = B1 / 2 */ "pand %%mm6, %%mm7 \n\t" /* mask off lower bits */ "psrlw $1, %%mm5 \n\t" /* mm5 = T1 / 2 */ "pand %%mm6, %%mm5 \n\t" /* mask off lower bits */ "paddw %%mm5, %%mm7 \n\t" /* mm7 = (T1 + B1) / 2 */ #endif /* calculate |M1-M0| put result in mm4 need to keep mm0 intact * if we have a good processor then make mm0 the average of M1 and M0 * which should make weave look better when there is small amounts of * movement */ #if defined(IS_SSE) "movq %%mm0, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" "psubusb %%mm2, %%mm4 \n\t" "psubusb %%mm0, %%mm5 \n\t" "por %%mm5, %%mm4 \n\t" "psrlw $1, %%mm4 \n\t" "pavgb %%mm2, %%mm0 \n\t" "pand %%mm6, %%mm4 \n\t" #elif defined(IS_3DNOW) "movq %%mm0, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" "psubusb %%mm2, %%mm4 \n\t" "psubusb %%mm0, %%mm5 \n\t" "por %%mm5, %%mm4 \n\t" "psrlw $1, %%mm4 \n\t" "pavgusb %%mm2, %%mm0 \n\t" "pand %%mm6, %%mm4 \n\t" #else "movq %%mm0, %%mm4 \n\t" "psubusb %%mm2, %%mm4 \n\t" "psubusb %%mm0, %%mm2 \n\t" "por %%mm2, %%mm4 \n\t" "psrlw $1, %%mm4 \n\t" "pand %%mm6, %%mm4 \n\t" #endif /* if |M1-M0| > Threshold we want dword worth of twos */ "pcmpgtb %3, %%mm4 \n\t" "pand %4, %%mm4 \n\t" /* get rid of sign bit */ "pcmpgtd %5, %%mm4 \n\t" /* do we want to bob */ "pandn %6, %%mm4 \n\t" "movq %1, %%mm2 \n\t" /* mm2 = T0 */ /* calculate |T1-T0| put result in mm5 */ "movq %%mm2, %%mm5 \n\t" "psubusb %%mm1, %%mm5 \n\t" "psubusb %%mm2, %%mm1 \n\t" "por %%mm1, %%mm5 \n\t" "psrlw $1, %%mm5 \n\t" "pand %%mm6, %%mm5 \n\t" /* if |T1-T0| > Threshold we want dword worth of ones */ "pcmpgtb %3, %%mm5 \n\t" "pand %%mm6, %%mm5 \n\t" /* get rid of sign bit */ "pcmpgtd %5, %%mm5 \n\t" "pandn %5, %%mm5 \n\t" "paddd %%mm5, %%mm4 \n\t" "movq %2, %%mm2 \n\t" /* B0 */ /* calculate |B1-B0| put result in mm5 */ "movq %%mm2, %%mm5 \n\t" "psubusb %%mm3, %%mm5 \n\t" "psubusb %%mm2, %%mm3 \n\t" "por %%mm3, %%mm5 \n\t" "psrlw $1, %%mm5 \n\t" "pand %%mm6, %%mm5 \n\t" /* if |B1-B0| > Threshold we want dword worth of ones */ "pcmpgtb %3, %%mm5 \n\t" "pand %%mm6, %%mm5 \n\t" /* get rid of any sign bit */ "pcmpgtd %5, %%mm5 \n\t" "pandn %5, %%mm5 \n\t" "paddd %%mm5, %%mm4 \n\t" "pcmpgtd %6, %%mm4 \n\t" /* debugging feature * output the value of mm4 at this point which is pink where we will weave * and green were we are going to bob */ #ifdef CHECK_BOBWEAVE #ifdef IS_SSE "movntq %%mm4, %0 \n\t" #else "movq %%mm4, %0 \n\t" #endif #else "movq %%mm4, %%mm5 \n\t" /* mm4 now is 1 where we want to weave and 0 where we want to bob */ "pand %%mm0, %%mm4 \n\t" "pandn %%mm7, %%mm5 \n\t" "por %%mm5, %%mm4 \n\t" #ifdef IS_SSE "movntq %%mm4, %0 \n\t" #else "movq %%mm4, %0 \n\t" #endif #endif : "=m" (*Dest2) : "m" (*T0), "m" (*B0), "m" (qwGreedyTwoFrameThreshold), "m" (Mask), "m" (DwordOne), "m" (DwordTwo) ); /* Advance to the next set of pixels. */ T1 += 8; M1 += 8; B1 += 8; M0 += 8; T0 += 8; B0 += 8; Dest2 += 8; } while( --count ); Dest += outstride; M1 += PitchRest; T1 += PitchRest; B1 += PitchRest; M0 += PitchRest; T0 += PitchRest; B0 += PitchRest; } #ifdef IS_SSE asm("sfence\n\t"); #endif if( bottom_field ) { xine_fast_memcpy(Dest, T1, stride); Dest += outstride; xine_fast_memcpy(Dest, M1, stride); } else { xine_fast_memcpy(Dest, T1, stride); } /* clear out the MMX registers ready for doing floating point again */ asm("emms\n\t"); #endif }