From 81823acd03c99d28cb16562afd4e523d42e40f5b Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <rscheidegger_lists@hispeed.ch>
Date: Tue, 22 May 2012 11:18:17 +0300
Subject: Added SSE2 version of DeinterlaceGreedy2Frame

---
 src/post/deinterlace/plugins/Makefile.am           |   2 +-
 src/post/deinterlace/plugins/greedy2frame.c        |  35 ++-
 .../plugins/greedy2frame_template_sse2.c           | 291 +++++++++++++++++++++
 3 files changed, 326 insertions(+), 2 deletions(-)
 create mode 100644 src/post/deinterlace/plugins/greedy2frame_template_sse2.c

(limited to 'src')

diff --git a/src/post/deinterlace/plugins/Makefile.am b/src/post/deinterlace/plugins/Makefile.am
index 800e6b0fa..b2068a019 100644
--- a/src/post/deinterlace/plugins/Makefile.am
+++ b/src/post/deinterlace/plugins/Makefile.am
@@ -23,7 +23,7 @@ AM_CFLAGS   = $(VISIBILITY_FLAG)
 AM_CPPFLAGS = -I$(top_srcdir)/src/post/deinterlace \
               -I$(top_srcdir)/src/xine-utils
 
-EXTRA_DIST = greedy2frame_template.c greedyh.asm \
+EXTRA_DIST = greedy2frame_template.c greedy2frame_template_sse2.c greedyh.asm \
 	tomsmocomp/SearchLoop0A.inc tomsmocomp/SearchLoopBottom.inc \
 	tomsmocomp/SearchLoopEdgeA.inc tomsmocomp/SearchLoopEdgeA8.inc \
 	tomsmocomp/SearchLoopOddA.inc tomsmocomp/SearchLoopOddA2.inc \
diff --git a/src/post/deinterlace/plugins/greedy2frame.c b/src/post/deinterlace/plugins/greedy2frame.c
index ef7486300..964c490a9 100644
--- a/src/post/deinterlace/plugins/greedy2frame.c
+++ b/src/post/deinterlace/plugins/greedy2frame.c
@@ -52,6 +52,39 @@
 #include "greedy2frame_template.c"
 #undef IS_MMXEXT
 
+#include "greedy2frame_template_sse2.c"
+
+static void DeinterlaceGreedy2Frame(uint8_t *output, int outstride,
+                                    deinterlace_frame_data_t *data,
+                                    int bottom_field, int second_field, int width, int height )
+{
+    /*
+     * Run-time dispatch. The SSE2 kernel uses aligned 16-byte loads and
+     * streaming stores, so it is chosen only if output, outstride and each
+     * source field it reads (f0, f1, and f2 when second_field is 0) are 16-byte
+     * aligned and width is a multiple of 8 (2 bytes per pixel, 16-byte groups).
+     *
+     * Instead of using an unaligned sse2 version just fall back to mmx
+     * which has no alignment restriction (though might be slow unaligned,
+     * but shouldn't hit this hopefully anyway). In the original author's
+     * experiments this was at least as fast as a naive unaligned sse2
+     * version anyway (due to the inability to use streaming stores).
+     */
+    const int unaligned = ((uintptr_t)output & 15) || (outstride & 15) || (width & 7) ||
+                          ((uintptr_t)data->f0 & 15) || ((uintptr_t)data->f1 & 15) ||
+                          (!second_field && ((uintptr_t)data->f2 & 15));
+
+    if ((xine_mm_accel() & MM_ACCEL_X86_SSE2) && !unaligned) {
+        DeinterlaceGreedy2Frame_SSE2(output, outstride, data,
+                                     bottom_field, second_field, width, height );
+    } else {
+        /* could fall back to 3dnow/mmx here too */
+        DeinterlaceGreedy2Frame_MMXEXT(output, outstride, data,
+                                       bottom_field, second_field, width, height );
+    }
+}
+
+
 static deinterlace_method_t greedy2framemethod =
 {
     "Greedy 2-frame (DScaler)",
@@ -62,7 +95,7 @@ static deinterlace_method_t greedy2framemethod =
     0,
     0,
     0,
-    DeinterlaceGreedy2Frame_MMXEXT,
+    DeinterlaceGreedy2Frame,
     1,
     NULL
 };
diff --git a/src/post/deinterlace/plugins/greedy2frame_template_sse2.c b/src/post/deinterlace/plugins/greedy2frame_template_sse2.c
new file mode 100644
index 000000000..ac570a8eb
--- /dev/null
+++ b/src/post/deinterlace/plugins/greedy2frame_template_sse2.c
@@ -0,0 +1,291 @@
+/*****************************************************************************
+** Copyright (c) 2000 John Adcock, Tom Barry, Steve Grimm  All rights reserved.
+** port copyright (c) 2003 Miguel Freitas
+******************************************************************************
+**
+**  This file is subject to the terms of the GNU General Public License as
+**  published by the Free Software Foundation.  A copy of this license is
+**  included with this software distribution in the file COPYING.  If you
+**  do not have a copy, you may obtain a copy by writing to the Free
+**  Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+**
+**  This software is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**  GNU General Public License for more details
+******************************************************************************
+** CVS Log
+**
+** Revision 1.10  2006/12/21 09:54:45  dgp85
+** Apply the textrel patch from Gentoo, thanks to PaX team for providing it. The patch was applied and tested for a while in Gentoo and Pardus, and solves also Debian's problems with non-PIC code. If problems will arise, they'll be debugged.
+**
+** Revision 1.9  2006/02/04 14:06:29  miguelfreitas
+** Enable AMD64 mmx/sse support in some plugins (tvtime, libmpeg2, goom...)
+** patch by dani3l
+**
+** Revision 1.8  2005/06/05 16:00:06  miguelfreitas
+** quite some hacks for gcc 2.95 compatibility
+**
+** Revision 1.7  2004/04/09 02:57:06  miguelfreitas
+** tvtime deinterlacing algorithms assumed top_field_first=1
+** top_field_first=0 (aka bottom_field_first) should now work as expected
+**
+** Revision 1.6  2004/02/12 20:53:31  mroi
+** my gcc (partly 3.4 already) optimizes these away, because they are only used
+** inside inline assembler (which the compiler does not recognize); so actually
+** the code is wrong (the asm parts should list these as inputs), but telling
+** the compiler to keep them is the easier fix
+**
+** Revision 1.5  2004/01/05 12:15:55  siggi
+** wonder why Mike isn't complaining about C++ style comments, any more...
+**
+** Revision 1.4  2004/01/05 01:47:26  tmmm
+** DOS/Win CRs are forbidden, verboten, interdit
+**
+** Revision 1.3  2004/01/02 20:53:43  miguelfreitas
+** better MANGLE from ffmpeg
+**
+** Revision 1.2  2004/01/02 20:47:03  miguelfreitas
+** my small contribution to the cygwin port ;-)
+**
+** Revision 1.1  2003/06/22 17:30:03  miguelfreitas
+** use our own port of greedy2frame (tvtime port is currently broken)
+**
+** Revision 1.8  2001/11/23 17:18:54  adcockj
+** Fixed silly and/or confusion
+**
+** Revision 1.7  2001/11/22 22:27:00  adcockj
+** Bug Fixes
+**
+** Revision 1.6  2001/11/21 15:21:40  adcockj
+** Renamed DEINTERLACE_INFO to TDeinterlaceInfo in line with standards
+** Changed TDeinterlaceInfo structure to have history of pictures.
+**
+** Revision 1.5  2001/07/31 06:48:33  adcockj
+** Fixed index bug spotted by Peter Gubanov
+**
+** Revision 1.4  2001/07/13 16:13:33  adcockj
+** Added CVS tags and removed tabs
+**
+*****************************************************************************/
+
+/*
+ * This is the implementation of the Greedy 2-frame deinterlace algorithm
+ * described in DI_Greedy2Frame.c.  It's in a separate file so we can compile
+ * variants for different CPU types; most of the code is the same in the
+ * different variants.
+ */
+
+
+/****************************************************************************
+** Field 1 | Field 2 | Field 3 | Field 4 |
+**   T0    |         |    T1   |         |
+**         |   M0    |         |    M1   |
+**   B0    |         |    B1   |         |
+*/
+
+
+static const sse_t Mask128 = { .uq = { 0x7f7f7f7f7f7f7f7fll, 0x7f7f7f7f7f7f7f7fll} }; /* 0x7f in each of the 16 bytes */
+#define TP GREEDYTWOFRAMETHRESHOLD, GREEDYTWOFRAMETHRESHOLD2
+static const sse_t GreedyTwoFrameThreshold128 = { .ub = {TP, TP, TP, TP, TP, TP, TP, TP} }; /* threshold byte pair, repeated 8 times */
+#undef TP
+
+static void DeinterlaceGreedy2Frame_SSE2(uint8_t *output, int outstride,
+                                         deinterlace_frame_data_t *data,
+                                         int bottom_field, int second_field,
+                                         int width, int height )
+{
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+    int Line;
+    int stride = width * 2;            /* bytes per source scanline (2 per pixel) */
+    register uint8_t* M1;              /* M1/M0/T1/T0: row cursors, named as in the field diagram above */
+    register uint8_t* M0;
+    register uint8_t* T1;
+    register uint8_t* T0;
+    uint8_t* Dest = output;            /* start of the next output line to write */
+    register uint8_t* Dest2;
+    register uint8_t* Destc;
+    register int count;
+    uint32_t Pitch = stride * 2;       /* two scanlines in bytes; T + Pitch addresses the B line */
+    uint32_t LineLength = stride;
+    uint32_t PitchRest = Pitch - (LineLength >> 4)*16; /* what is left of Pitch after the inner loop's 16-byte steps */
+    /* Per loop pass: copy one T1 line to the output, then build the next output line by weaving or bobbing. */
+    if( second_field ) {
+        M1 = data->f0;
+        T1 = data->f0;
+        M0 = data->f1;
+        T0 = data->f1;
+    } else {
+        M1 = data->f0;
+        T1 = data->f1;
+        M0 = data->f1;
+        T0 = data->f2;
+    }
+    /* bottom_field selects the starting row of each cursor; the other case also copies one M1 line out first. */
+    if( bottom_field ) {
+        M1 += stride;
+        T1 += 0;
+        M0 += stride;
+        T0 += 0;
+    } else {
+        M1 += Pitch;
+        T1 += stride;
+        M0 += Pitch;
+        T0 += stride;
+
+        xine_fast_memcpy(Dest, M1, LineLength);
+        Dest += outstride;
+    }
+
+    for (Line = 0; Line < (height / 2) - 1; ++Line)
+    {
+      /* Always use the most recent data verbatim.  By definition it's correct
+       * (it'd be shown on an interlaced display) and our job is to fill in
+       * the spaces between the new lines.
+       */
+      /* A separate xine_fast_memcpy would be pretty pointless here: we load
+       * the same data anyway, so it's just one additional mov per loop...
+       * XXX I believe some cpus with sse2 (early A64?) only have one write
+       * buffer. Using movntdq with 2 different streams may have quite
+       * bad performance consequences on such cpus.
+       */
+
+        Destc = Dest;                  /* line that receives the verbatim copy of T1 */
+        Dest += outstride;
+        Dest2 = Dest;                  /* following line, receives the weave/bob result */
+
+        /* just rely on gcc not using xmm regs... NOTE(review): none of the asm statements here declares clobbers */
+        do {
+          asm volatile(
+            "movdqa  %0, %%xmm6			\n\t"     // xmm6 = Mask
+            "pxor    %%xmm7, %%xmm7		\n\t"     // xmm7 = zero
+            : /* no output */
+            : "m" (Mask128) );
+        } while (0);
+
+        count = LineLength >> 4;       /* 16-byte groups per line; NOTE(review): loop below assumes count != 0 */
+        do {
+          asm volatile(
+       /* Figure out what to do with the scanline above the one we copy.
+        * See above for a description of the algorithm.
+        * weave if (weave(M) AND (weave(T) OR weave(B)))
+        */
+            "movdqa  (%4), %%xmm1		\n\t" /* xmm1 = T1 */
+            "movdqa  (%5), %%xmm0		\n\t" /* xmm0 = T0 */
+            "movdqa  (%q6,%4), %%xmm3		\n\t" /* xmm3 = B1 */
+            "movdqa  (%q6,%5), %%xmm2		\n\t" /* xmm2 = B0 */
+
+            /* calculate |T1-T0| keep T1 put result in xmm5 */
+            "movdqa  %%xmm1, %%xmm5		\n\t"
+            "psubusb %%xmm0, %%xmm5		\n\t"
+            "psubusb %%xmm1, %%xmm0		\n\t"
+            "por     %%xmm0, %%xmm5		\n\t"
+
+            "movdqa  (%0), %%xmm0		\n\t" /* xmm0 = M1 */
+            /* T1 is data for line to copy; it is stored to *Destc (operand %3) */
+            "movntdq  %%xmm1, %3		\n\t"
+
+            /* if |T1-T0| > Threshold we want 0 else dword minus one (all ones) */
+            "psrlw   $1, %%xmm5			\n\t"
+            "pand    %%xmm6, %%xmm5		\n\t"
+            "pcmpgtb %2, %%xmm5			\n\t"
+            "pcmpeqd %%xmm7, %%xmm5		\n\t"
+
+            /* calculate |B1-B0| keep B1 put result in xmm4 */
+            "movdqa  %%xmm3, %%xmm4		\n\t"
+            "psubusb %%xmm2, %%xmm4		\n\t"
+            "psubusb %%xmm3, %%xmm2		\n\t"
+            "por     %%xmm2, %%xmm4		\n\t"
+
+            "movdqa  (%1), %%xmm2		\n\t" /* xmm2 = M0 */
+
+            /* if |B1-B0| > Threshold we want 0 else dword minus one (all ones) */
+            "psrlw   $1, %%xmm4			\n\t"
+            "pand    %%xmm6, %%xmm4		\n\t"
+            "pcmpgtb %2, %%xmm4			\n\t"
+            "pcmpeqd %%xmm7, %%xmm4		\n\t"
+
+            "prefetcht0  64(%q6,%4)		\n\t"
+            "prefetcht0  64(%q6,%5)		\n\t"
+
+            "por     %%xmm4, %%xmm5		\n\t"
+
+            /* Average T1 and B1 so we can do interpolated bobbing if we bob
+             * onto T1 */
+            "pavgb   %%xmm3, %%xmm1		\n\t" /* xmm1 = avg(T1,B1) */
+
+            "prefetcht0  64(%0)			\n\t"
+            "prefetcht0  64(%1)			\n\t"
+
+            /* make xmm3 the average of M1 and M0 which should make weave
+             * look better when there are small amounts of movement */
+            "movdqa  %%xmm2, %%xmm3		\n\t"
+            "pavgb   %%xmm0, %%xmm3		\n\t" /* xmm3 = avg(M1,M0) */
+
+            /* calculate |M1-M0| put result in xmm4 */
+            "movdqa  %%xmm0, %%xmm4		\n\t"
+            "psubusb %%xmm2, %%xmm4		\n\t"
+            "psubusb %%xmm0, %%xmm2		\n\t"
+            "por     %%xmm2, %%xmm4		\n\t"
+
+            /* if |M1-M0| > Threshold we want 0 else dword minus one (all ones) */
+            "psrlw   $1, %%xmm4			\n\t"
+            "pand    %%xmm6, %%xmm4		\n\t"
+            "pcmpgtb %2, %%xmm4			\n\t"
+            "pcmpeqd %%xmm7, %%xmm4		\n\t" /* do we want to bob */
+
+            "pand   %%xmm5, %%xmm4		\n\t"
+          :
+          : "r" (M1), "r" (M0), "m" (GreedyTwoFrameThreshold128),
+            "m" (*Destc), "r" (T1), "r" (T0), "r" (Pitch) );
+
+          asm volatile(
+/* debugging feature
+ * output the value of xmm4 at this point which is pink where we will weave
+ * and green where we are going to bob
+ */
+#ifdef CHECK_BOBWEAVE
+            "movntdq  %%xmm4, %0		\n\t"
+#else
+            /* xmm4 now is all ones where we want to weave and 0 where we want to bob */
+            "pand    %%xmm4, %%xmm3		\n\t"
+            "pandn   %%xmm1, %%xmm4		\n\t"
+            "por     %%xmm3, %%xmm4		\n\t"
+            "movntdq  %%xmm4, %0		\n\t"
+#endif
+          :
+          : "m" (*Dest2));
+
+          /* Advance to the next set of pixels (one 16-byte group). */
+          T1 += 16;
+          M1 += 16;
+          M0 += 16;
+          T0 += 16;
+          Dest2 += 16;
+          Destc += 16;
+
+        } while( --count );
+
+        Dest += outstride;
+
+        M1 += PitchRest;
+        T1 += PitchRest;
+        M0 += PitchRest;
+        T0 += PitchRest;
+    }
+
+    asm("sfence\n\t");                 /* fence the movntdq streaming stores issued above */
+
+    if( bottom_field )
+    {
+        xine_fast_memcpy(Dest, T1, stride);
+        Dest += outstride;
+        xine_fast_memcpy(Dest, M1, stride);
+    }
+    else
+    {
+        xine_fast_memcpy(Dest, T1, stride);
+    }
+#endif
+}
+
-- 
cgit v1.2.3