///////////////////////////////////////////////////////////////////////////// // $Id: greedy2frame_template.c,v 1.1 2003/06/22 17:30:03 miguelfreitas Exp $ ///////////////////////////////////////////////////////////////////////////// // Copyright (c) 2000 John Adcock, Tom Barry, Steve Grimm All rights reserved. // port copyright (c) 2003 Miguel Freitas ///////////////////////////////////////////////////////////////////////////// // // This file is subject to the terms of the GNU General Public License as // published by the Free Software Foundation. A copy of this license is // included with this software distribution in the file COPYING. If you // do not have a copy, you may obtain a copy by writing to the Free // Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. // // This software is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details ///////////////////////////////////////////////////////////////////////////// // CVS Log // // $Log: greedy2frame_template.c,v $ // Revision 1.1 2003/06/22 17:30:03 miguelfreitas // use our own port of greedy2frame (tvtime port is currently broken) // // Revision 1.8 2001/11/23 17:18:54 adcockj // Fixed silly and/or confusion // // Revision 1.7 2001/11/22 22:27:00 adcockj // Bug Fixes // // Revision 1.6 2001/11/21 15:21:40 adcockj // Renamed DEINTERLACE_INFO to TDeinterlaceInfo in line with standards // Changed TDeinterlaceInfo structure to have history of pictures. // // Revision 1.5 2001/07/31 06:48:33 adcockj // Fixed index bug spotted by Peter Gubanov // // Revision 1.4 2001/07/13 16:13:33 adcockj // Added CVS tags and removed tabs // ///////////////////////////////////////////////////////////////////////////// // This is the implementation of the Greedy 2-frame deinterlace algorithm described in // DI_Greedy2Frame.c. It's in a separate file so we can compile variants for different // CPU types; most of the code is the same in the different variants. /////////////////////////////////////////////////////////////////////////////// // Field 1 | Field 2 | Field 3 | Field 4 | // T0 | | T1 | | // | M0 | | M1 | // B0 | | B1 | | // // debugging feature // output the value of mm4 at this point which is pink where we will weave // and green were we are going to bob // uncomment next line to see this //#define CHECK_BOBWEAVE #if !defined(MASKS_DEFINED) #define MASKS_DEFINED static const int64_t YMask = 0x00ff00ff00ff00ff; static const int64_t Mask = 0x7f7f7f7f7f7f7f7f; static const int64_t DwordOne = 0x0000000100000001; static const int64_t DwordTwo = 0x0000000200000002; static int64_t qwGreedyTwoFrameThreshold; #endif #if defined(IS_SSE) static void DeinterlaceGreedy2Frame_SSE(uint8_t *output, int outstride, deinterlace_frame_data_t *data, int bottom_field, int width, int height ) #elif defined(IS_3DNOW) static void DeinterlaceGreedy2Frame_3DNOW(uint8_t *output, int outstride, deinterlace_frame_data_t *data, int bottom_field, int width, int height ) #else static void DeinterlaceGreedy2Frame_MMX(uint8_t *output, int outstride, deinterlace_frame_data_t *data, int bottom_field, int width, int height ) #endif { #ifdef ARCH_X86 int Line; int stride = width * 2; register uint8_t* M1; register uint8_t* M0; register uint8_t* T0; register uint8_t* T1; register uint8_t* B1; register uint8_t* B0; uint8_t* Dest = output; register uint8_t* Dest2; register int count; uint32_t Pitch = stride*2; uint32_t LineLength = stride; uint32_t PitchRest = Pitch - (LineLength >> 3)*8; qwGreedyTwoFrameThreshold = GreedyTwoFrameThreshold; qwGreedyTwoFrameThreshold += (GreedyTwoFrameThreshold2 << 8); qwGreedyTwoFrameThreshold += (qwGreedyTwoFrameThreshold << 48) + (qwGreedyTwoFrameThreshold << 32) + (qwGreedyTwoFrameThreshold << 16); if( bottom_field ) { M1 = data->f0 + stride; T1 = data->f0; B1 = T1 + Pitch; M0 = data->f1 + stride; T0 = data->f1; B0 = T0 + Pitch; } else { M1 = data->f0 + Pitch; T1 = data->f1 + stride; B1 = T1 + Pitch; M0 = data->f1 + Pitch; T0 = data->f2 + stride; B0 = T0 + Pitch; xine_fast_memcpy(Dest, M1, LineLength); Dest += outstride; } for (Line = 0; Line < (height / 2) - 1; ++Line) { // Always use the most recent data verbatim. By definition it's correct (it'd // be shown on an interlaced display) and our job is to fill in the spaces // between the new lines. xine_fast_memcpy(Dest, T1, stride); Dest += outstride; Dest2 = Dest; count = LineLength >> 3; do { asm volatile( // Figure out what to do with the scanline above the one we just copied. // See above for a description of the algorithm. ".align 8 \n\t" "movq Mask, %%mm6 \n\t" "movq %0, %%mm1 \n\t" // T1 "movq %1, %%mm0 \n\t" // M1 "movq %2, %%mm3 \n\t" // B1 "movq %3, %%mm2 \n\t" // M0 : /* no output */ : "m" (*T1), "m" (*M1), "m" (*B1), "m" (*M0) ); asm volatile( // Figure out what to do with the scanline above the one we just copied. // See above for a description of the algorithm. // Average T1 and B1 so we can do interpolated bobbing if we bob onto T1. "movq %%mm3, %%mm7 \n\t" // mm7 = B1 #if defined(IS_SSE) "pavgb %%mm1, %%mm7 \n\t" #elif defined(IS_3DNOW) "pavgusb %%mm1, %%mm7 \n\t" #else "movq %%mm1, %%mm5 \n\t" // mm5 = T1 "psrlw $1, %%mm7 \n\t" // mm7 = B1 / 2 "pand %%mm6, %%mm7 \n\t" // mask off lower bits "psrlw $1, %%mm5 \n\t" // mm5 = T1 / 2 "pand %%mm6, %%mm5 \n\t" // mask off lower bits "paddw %%mm5, %%mm7 \n\t" // mm7 = (T1 + B1) / 2 #endif // calculate |M1-M0| put result in mm4 need to keep mm0 intact // if we have a good processor then make mm0 the average of M1 and M0 // which should make weave look better when there is small amounts of // movement #if defined(IS_SSE) "movq %%mm0, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" "psubusb %%mm2, %%mm4 \n\t" "psubusb %%mm0, %%mm5 \n\t" "por %%mm5, %%mm4 \n\t" "psrlw $1, %%mm4 \n\t" "pavgb %%mm2, %%mm0 \n\t" "pand %%mm6, %%mm4 \n\t" #elif defined(IS_3DNOW) "movq %%mm0, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" "psubusb %%mm2, %%mm4 \n\t" "psubusb %%mm0, %%mm5 \n\t" "por %%mm5, %%mm4 \n\t" "psrlw $1, %%mm4 \n\t" "pavgusb %%mm2, %%mm0 \n\t" "pand %%mm6, %%mm4 \n\t" #else "movq %%mm0, %%mm4 \n\t" "psubusb %%mm2, %%mm4 \n\t" "psubusb %%mm0, %%mm2 \n\t" "por %%mm2, %%mm4 \n\t" "psrlw $1, %%mm4 \n\t" "pand %%mm6, %%mm4 \n\t" #endif // if |M1-M0| > Threshold we want dword worth of twos "pcmpgtb qwGreedyTwoFrameThreshold, %%mm4 \n\t" "pand Mask, %%mm4 \n\t" // get rid of any sign bit "pcmpgtd DwordOne, %%mm4 \n\t" // do we want to bob "pandn DwordTwo, %%mm4 \n\t" "movq %1, %%mm2 \n\t" // mm2 = T0 // calculate |T1-T0| put result in mm5 "movq %%mm2, %%mm5 \n\t" "psubusb %%mm1, %%mm5 \n\t" "psubusb %%mm2, %%mm1 \n\t" "por %%mm1, %%mm5 \n\t" "psrlw $1, %%mm5 \n\t" "pand %%mm6, %%mm5 \n\t" // if |T1-T0| > Threshold we want dword worth of ones "pcmpgtb qwGreedyTwoFrameThreshold, %%mm5 \n\t" "pand %%mm6, %%mm5 \n\t" // get rid of any sign bit "pcmpgtd DwordOne, %%mm5 \n\t" "pandn DwordOne, %%mm5 \n\t" "paddd %%mm5, %%mm4 \n\t" "movq %2, %%mm2 \n\t" // B0 // calculate |B1-B0| put result in mm5 "movq %%mm2, %%mm5 \n\t" "psubusb %%mm3, %%mm5 \n\t" "psubusb %%mm2, %%mm3 \n\t" "por %%mm3, %%mm5 \n\t" "psrlw $1, %%mm5 \n\t" "pand %%mm6, %%mm5 \n\t" // if |B1-B0| > Threshold we want dword worth of ones "pcmpgtb qwGreedyTwoFrameThreshold, %%mm5 \n\t" "pand %%mm6, %%mm5 \n\t" // get rid of any sign bit "pcmpgtd DwordOne, %%mm5 \n\t" "pandn DwordOne, %%mm5 \n\t" "paddd %%mm5, %%mm4 \n\t" "pcmpgtd DwordTwo, %%mm4 \n\t" // debugging feature // output the value of mm4 at this point which is pink where we will weave // and green were we are going to bob #ifdef CHECK_BOBWEAVE #ifdef IS_SSE "movntq %%mm4, %0 \n\t" #else "movq %%mm4, %0 \n\t" #endif #else "movq %%mm4, %%mm5 \n\t" // mm4 now is 1 where we want to weave and 0 where we want to bob "pand %%mm0, %%mm4 \n\t" "pandn %%mm7, %%mm5 \n\t" "por %%mm5, %%mm4 \n\t" #ifdef IS_SSE "movntq %%mm4, %0 \n\t" #else "movq %%mm4, %0 \n\t" #endif #endif : "=m" (*Dest2) : "m" (*T0), "m" (*B0) ); // Advance to the next set of pixels. T1 += 8; M1 += 8; B1 += 8; M0 += 8; T0 += 8; B0 += 8; Dest2 += 8; } while( --count ); Dest += outstride; M1 += PitchRest; T1 += PitchRest; B1 += PitchRest; M0 += PitchRest; T0 += PitchRest; B0 += PitchRest; } #ifdef IS_SSE asm("sfence\n\t"); #endif if( bottom_field ) { xine_fast_memcpy(Dest, T1, stride); Dest += outstride; xine_fast_memcpy(Dest, M1, stride); } else { xine_fast_memcpy(Dest, T1, stride); } // clear out the MMX registers ready for doing floating point // again asm("emms\n\t"); #endif }