/*****************************************************************************
** Copyright (c) 2000 John Adcock, Tom Barry, Steve Grimm  All rights reserved.
** port copyright (c) 2003 Miguel Freitas
******************************************************************************
**
**  This file is subject to the terms of the GNU General Public License as
**  published by the Free Software Foundation.  A copy of this license is
**  included with this software distribution in the file COPYING.  If you
**  do not have a copy, you may obtain a copy by writing to the Free
**  Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
**
**  This software is distributed in the hope that it will be useful,
**  but WITHOUT ANY WARRANTY; without even the implied warranty of
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**  GNU General Public License for more details
******************************************************************************
** CVS Log
**
** Revision 1.10  2006/12/21 09:54:45  dgp85
** Apply the textrel patch from Gentoo, thanks to PaX team for providing it. The patch was applied and tested for a while in Gentoo and Pardus, and solves also Debian's problems with non-PIC code. If problems will arise, they'll be debugged.
**
** Revision 1.9  2006/02/04 14:06:29  miguelfreitas
** Enable AMD64 mmx/sse support in some plugins (tvtime, libmpeg2, goom...)
** patch by dani3l
**
** Revision 1.8  2005/06/05 16:00:06  miguelfreitas
** quite some hacks for gcc 2.95 compatibility
**
** Revision 1.7  2004/04/09 02:57:06  miguelfreitas
** tvtime deinterlacing algorithms assumed top_field_first=1
** top_field_first=0 (aka bottom_field_first) should now work as expected
**
** Revision 1.6  2004/02/12 20:53:31  mroi
** my gcc (partly 3.4 already) optimizes these away, because they are only used
** inside inline assembler (which the compiler does not recognize); so actually
** the code is wrong (the asm parts should list these as inputs), but telling
** the compiler to keep them is the easier fix
**
** Revision 1.5  2004/01/05 12:15:55  siggi
** wonder why Mike isn't complaining about C++ style comments, any more...
**
** Revision 1.4  2004/01/05 01:47:26  tmmm
** DOS/Win CRs are forbidden, verboten, interdit
**
** Revision 1.3  2004/01/02 20:53:43  miguelfreitas
** better MANGLE from ffmpeg
**
** Revision 1.2  2004/01/02 20:47:03  miguelfreitas
** my small contribution to the cygwin port ;-)
**
** Revision 1.1  2003/06/22 17:30:03  miguelfreitas
** use our own port of greedy2frame (tvtime port is currently broken)
**
** Revision 1.8  2001/11/23 17:18:54  adcockj
** Fixed silly and/or confusion
**
** Revision 1.7  2001/11/22 22:27:00  adcockj
** Bug Fixes
**
** Revision 1.6  2001/11/21 15:21:40  adcockj
** Renamed DEINTERLACE_INFO to TDeinterlaceInfo in line with standards
** Changed TDeinterlaceInfo structure to have history of pictures.
**
** Revision 1.5  2001/07/31 06:48:33  adcockj
** Fixed index bug spotted by Peter Gubanov
**
** Revision 1.4  2001/07/13 16:13:33  adcockj
** Added CVS tags and removed tabs
**
*****************************************************************************/

/*
 * This is the implementation of the Greedy 2-frame deinterlace algorithm
 * described in DI_Greedy2Frame.c.  It's in a separate file so we can compile
 * variants for different CPU types; most of the code is the same in the
 * different variants.
 */


/****************************************************************************
** Field 1 | Field 2 | Field 3 | Field 4 |
**   T0    |         |    T1   |         |
**         |   M0    |         |    M1   |
**   B0    |         |    B1   |         |
*/

#if defined(ARCH_X86) || defined(ARCH_X86_64)
/*
 * 0x7f in every byte.  The deinterlacer ANDs this onto word-shifted
 * differences (psrlw $1) to clear the bit that leaked in from the
 * neighbouring byte, leaving each byte in 0..127 so that the signed byte
 * compare (pcmpgtb) against the thresholds behaves as intended.
 */
static const sse_t Mask128 = { .uq = { 0x7f7f7f7f7f7f7f7fll, 0x7f7f7f7f7f7f7f7fll } };

/*
 * Per-byte comparison thresholds: the pair
 * (GREEDYTWOFRAMETHRESHOLD, GREEDYTWOFRAMETHRESHOLD2) repeated eight times
 * to fill all 16 bytes of the vector.
 */
#define TP GREEDYTWOFRAMETHRESHOLD, GREEDYTWOFRAMETHRESHOLD2
static const sse_t GreedyTwoFrameThreshold128 = { .ub = {TP, TP, TP, TP, TP, TP, TP, TP} };
#undef TP
#endif

/*
 * Greedy 2-frame deinterlacer, SSE2 variant.
 *
 * Every loop iteration produces two output lines:
 *   - one line copied verbatim from T1 (the "line to copy"), and
 *   - one line chosen, per 32-bit group, between
 *       weave: avg(M1, M0)   and   bob: avg(T1, B1)
 *     where B1/B0 are the lines one Pitch below T1/T0.  A group is woven if
 *     |M1-M0| is within threshold AND (|T1-T0| OR |B1-B0| is within
 *     threshold); otherwise it is bobbed.  Differences are halved before
 *     being compared with GreedyTwoFrameThreshold128.
 *
 * Parameters:
 *   output        destination buffer; consecutive output lines are
 *                 outstride bytes apart.
 *   outstride     byte distance between output lines.
 *   data          source frames; f0, f1 and f2 are read (presumably newest
 *                 to oldest -- confirm against the caller).
 *   bottom_field  selects the starting line offsets of the T and M pointers
 *                 and which extra line copies are done before/after the loop.
 *   second_field  selects which of f0/f1/f2 supply T1/T0/M1/M0.
 *   width         used to derive the source line size (width * 2 bytes).
 *   height        the main loop runs (height / 2) - 1 times.
 *
 * Constraints imposed by the instructions used:
 *   - movdqa / movntdq need 16-byte aligned addresses, so source lines and
 *     output lines must be 16-byte aligned.
 *   - Only (width * 2) / 16 full 16-byte groups per line are processed by
 *     the SIMD loop; trailing bytes of a line are left untouched.  If a line
 *     is shorter than 16 bytes the SIMD loop is skipped entirely.
 *
 * The function body is empty unless ARCH_X86 or ARCH_X86_64 is defined.
 */
static void DeinterlaceGreedy2Frame_SSE2(uint8_t *output, int outstride,
                                         deinterlace_frame_data_t *data,
                                         int bottom_field, int second_field,
                                         int width, int height )
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
    int Line;
    int stride = width * 2;             /* bytes per source line */
    register uint8_t* M1;
    register uint8_t* M0;
    register uint8_t* T1;
    register uint8_t* T0;
    uint8_t* Dest = output;
    register uint8_t* Dest2;
    register uint8_t* Destc;
    register int count;
    uint32_t Pitch = stride * 2;        /* distance between lines of one field */
    uint32_t LineLength = stride;
    /* What is left of a Pitch after the SIMD loop advanced by whole
     * 16-byte groups; added after each line to reach the next field line. */
    uint32_t PitchRest = Pitch - (LineLength >> 4)*16;
    /* Pointer-width copy of Pitch.  The asm addresses memory as (%q4,%2),
     * i.e. through the full-width register; passing a 32-bit value there
     * would leave the upper half of the register unspecified on x86-64. */
    uintptr_t PitchOffset = Pitch;

    if( second_field ) {
        M1 = data->f0;
        T1 = data->f0;
        M0 = data->f1;
        T0 = data->f1;
    } else {
        M1 = data->f0;
        T1 = data->f1;
        M0 = data->f1;
        T0 = data->f2;
    }

    if( bottom_field ) {
        /* T lines start at source line 0, M lines at source line 1. */
        M1 += stride;
        T1 += 0;
        M0 += stride;
        T0 += 0;
    } else {
        /* T lines start at source line 1, M lines at source line 2; the
         * first output line is a plain copy of that M1 line. */
        M1 += Pitch;
        T1 += stride;
        M0 += Pitch;
        T0 += stride;

        xine_fast_memcpy(Dest, M1, LineLength);
        Dest += outstride;
    }

    for (Line = 0; Line < (height / 2) - 1; ++Line)
    {
      /* Always use the most recent data verbatim.  By definition it's correct
       * (it'd be shown on an interlaced display) and our job is to fill in
       * the spaces between the new lines.
       */
      /* xine_fast_memcpy would be pretty pointless here as we load the same
       * data anyway it's just one additional mov per loop...
       * XXX I believe some cpus with sse2 (early A64?) only have one write
       * buffer. Using movntdq with 2 different streams may have quite
       * bad performance consequences on such cpus.
       */

        Destc = Dest;           /* receives the copied T1 line */
        Dest += outstride;
        Dest2 = Dest;           /* receives the weave/bob line */

        /* Load the loop-invariant registers.  xmm6/xmm7 (and xmm0-xmm5
         * between the two asm statements below) carry state across asm
         * statements without being declared, so this still relies on the
         * compiler not using xmm registers in between. */
        asm volatile(
            "movdqa  %0, %%xmm6         \n\t" /* xmm6 = Mask */
            "pxor    %%xmm7, %%xmm7     \n\t" /* xmm7 = zero */
            : /* no output */
            : "m" (Mask128) );

        /* Guarded loop: with fewer than 16 bytes per line the group count is
         * zero and nothing may be processed (a do/while with --count would
         * wrap around and run far out of bounds). */
        for (count = LineLength >> 4; count > 0; --count) {
          asm volatile(
       /* Figure out what to do with the scanline above the one we copy.
        * See above for a description of the algorithm.
        * weave if (weave(M) AND (weave(T) OR weave(B)))
        */
            "movdqa  (%2), %%xmm1       \n\t" /* xmm1 = T1 */
            "movdqa  (%3), %%xmm0       \n\t" /* xmm0 = T0 */
            "movdqa  (%q4,%2), %%xmm3   \n\t" /* xmm3 = B1 */
            "movdqa  (%q4,%3), %%xmm2   \n\t" /* xmm2 = B0 */

            /* calculate |T1-T0| keep T1 put result in xmm5 */
            "movdqa  %%xmm1, %%xmm5     \n\t"
            "psubusb %%xmm0, %%xmm5     \n\t"
            "psubusb %%xmm1, %%xmm0     \n\t"
            "por     %%xmm0, %%xmm5     \n\t"

            /* T1 is data for line to copy */
            "movntdq  %%xmm1, %1        \n\t"

            /* if |T1-T0| > Threshold we want 0 else dword minus one */
            "psrlw   $1, %%xmm5         \n\t"
            "pand    %%xmm6, %%xmm5     \n\t"
            "pcmpgtb %0, %%xmm5         \n\t"
            "pcmpeqd %%xmm7, %%xmm5     \n\t"

            "prefetcht0  64(%q4,%2)     \n\t"
            "prefetcht0  64(%q4,%3)     \n\t"
          :
          : "m" (GreedyTwoFrameThreshold128),
            "m" (*Destc), "r" (T1), "r" (T0), "r" (PitchOffset)
          /* the asm stores 16 bytes at Destc and reads through T1/T0 */
          : "memory" );

          asm volatile (
            /* calculate |B1-B0| keep B1 put result in xmm4 */
            "movdqa  %%xmm3, %%xmm4     \n\t"
            "psubusb %%xmm2, %%xmm4     \n\t"
            "psubusb %%xmm3, %%xmm2     \n\t"
            "por     %%xmm2, %%xmm4     \n\t"

            "movdqa  (%0), %%xmm0       \n\t" /* xmm0 = M1 */
            "movdqa  (%1), %%xmm2       \n\t" /* xmm2 = M0 */

            /* if |B1-B0| > Threshold we want 0 else dword minus one */
            "psrlw   $1, %%xmm4         \n\t"
            "pand    %%xmm6, %%xmm4     \n\t"
            "pcmpgtb %2, %%xmm4         \n\t"
            "pcmpeqd %%xmm7, %%xmm4     \n\t"

            /* xmm5 = weave(T) OR weave(B) */
            "por     %%xmm4, %%xmm5     \n\t"

            /* Average T1 and B1 so we can do interpolated bobbing if we bob
             * onto T1 */
            "pavgb   %%xmm3, %%xmm1     \n\t" /* xmm1 = avg(T1,B1) */

            "prefetcht0  64(%0)         \n\t"
            "prefetcht0  64(%1)         \n\t"

            /* make xmm3 the average of M1 and M0 which should make weave
             * look better when there is small amounts of movement */
            "movdqa  %%xmm2, %%xmm3     \n\t"
            "pavgb   %%xmm0, %%xmm3     \n\t" /* xmm3 = avg(M1,M0) */

            /* calculate |M1-M0| put result in xmm4 */
            "movdqa  %%xmm0, %%xmm4     \n\t"
            "psubusb %%xmm2, %%xmm4     \n\t"
            "psubusb %%xmm0, %%xmm2     \n\t"
            "por     %%xmm2, %%xmm4     \n\t"

            /* if |M1-M0| > Threshold we want 0 else dword minus one */
            "psrlw   $1, %%xmm4         \n\t"
            "pand    %%xmm6, %%xmm4     \n\t"
            "pcmpgtb %2, %%xmm4         \n\t"
            "pcmpeqd %%xmm7, %%xmm4     \n\t" /* do we want to bob */

            /* xmm4 = weave(M) AND (weave(T) OR weave(B)) */
            "pand   %%xmm5, %%xmm4      \n\t"

/* debugging feature
 * output the value of xmm4 at this point which is pink where we will weave
 * and green where we are going to bob
 */
#ifdef CHECK_BOBWEAVE
            "movntdq  %%xmm4, %3        \n\t"
#else
            /* xmm4 is all ones where we want to weave and 0 where we want to
             * bob: select avg(M1,M0) or avg(T1,B1) accordingly */
            "pand    %%xmm4, %%xmm3     \n\t"
            "pandn   %%xmm1, %%xmm4     \n\t"
            "por     %%xmm3, %%xmm4     \n\t"
            "movntdq  %%xmm4, %3        \n\t"
#endif
          :
          : "r" (M1), "r" (M0), "m" (GreedyTwoFrameThreshold128),
            "m" (*Dest2)
          /* the asm stores 16 bytes at Dest2 and reads through M1/M0 */
          : "memory" );

          /* Advance to the next set of pixels. */
          T1 += 16;
          M1 += 16;
          M0 += 16;
          T0 += 16;
          Dest2 += 16;
          Destc += 16;
        }

        Dest += outstride;

        /* Move the source pointers on to the next line of their field. */
        M1 += PitchRest;
        T1 += PitchRest;
        M0 += PitchRest;
        T0 += PitchRest;
    }

    /* Make the non-temporal (movntdq) stores globally visible before the
     * ordinary copies below and before returning. */
    asm volatile("sfence\n\t" : : : "memory");

    if( bottom_field )
    {
        xine_fast_memcpy(Dest, T1, stride);
        Dest += outstride;
        xine_fast_memcpy(Dest, M1, stride);
    }
    else
    {
        xine_fast_memcpy(Dest, T1, stride);
    }
#endif
}