diff options
Diffstat (limited to 'src/post/deinterlace')
-rw-r--r-- | src/post/deinterlace/Makefile.am | 11 | ||||
-rw-r--r-- | src/post/deinterlace/deinterlace.c | 2 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/Makefile.am | 53 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/greedy.c | 5 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/greedy2frame.c | 48 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/greedy2frame_template.c | 166 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/greedy2frame_template_sse2.c | 293 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/greedyh.asm | 1 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/kdetv_greedyh.c | 4 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/kdetv_tomsmocomp.c | 4 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/linearblend.c | 5 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/tomsmocomp/TomsMoCompAll.inc | 2 | ||||
-rw-r--r-- | src/post/deinterlace/plugins/vfir.c | 5 | ||||
-rw-r--r-- | src/post/deinterlace/speedy.c | 220 | ||||
-rw-r--r-- | src/post/deinterlace/xine_plugin.c | 30 |
15 files changed, 681 insertions, 168 deletions
diff --git a/src/post/deinterlace/Makefile.am b/src/post/deinterlace/Makefile.am index 079ed5baf..cde794988 100644 --- a/src/post/deinterlace/Makefile.am +++ b/src/post/deinterlace/Makefile.am @@ -1,19 +1,18 @@ +include $(top_srcdir)/misc/Makefile.quiet include $(top_builddir)/misc/Makefile.plugins include $(top_srcdir)/misc/Makefile.common -SUBDIRS = plugins +AM_CFLAGS = $(DEFAULT_OCFLAGS) $(VISIBILITY_FLAG) +AM_LDFLAGS = $(xineplug_ldflags) $(IMPURE_TEXT_LDFLAGS) -EXTRA_DIST = +SUBDIRS = plugins xinepost_LTLIBRARIES = xineplug_post_tvtime.la xineplug_post_tvtime_la_SOURCES = xine_plugin.c \ deinterlace.c pulldown.c speedy.c tvtime.c +xineplug_post_tvtime_la_CFLAGS = $(AM_CFLAGS) -fno-strict-aliasing xineplug_post_tvtime_la_LIBADD = $(XINE_LIB) $(LTLIBINTL) $(PTHREAD_LIBS) \ $(top_builddir)/src/post/deinterlace/plugins/libdeinterlaceplugins.la -xineplug_post_tvtime_la_CFLAGS = $(VISIBILITY_FLAG) -xineplug_post_tvtime_la_LDFLAGS = $(xineplug_ldflags) \ - @IMPURE_TEXT_LDFLAGS@ - noinst_HEADERS = deinterlace.h pulldown.h speedtools.h speedy.h tvtime.h diff --git a/src/post/deinterlace/deinterlace.c b/src/post/deinterlace/deinterlace.c index 8e4a3bb00..5c0356c55 100644 --- a/src/post/deinterlace/deinterlace.c +++ b/src/post/deinterlace/deinterlace.c @@ -31,7 +31,7 @@ */ #include "deinterlace.h" -#include "xine_internal.h" +#include <xine/xine_internal.h> typedef struct methodlist_item_s methodlist_item_t; diff --git a/src/post/deinterlace/plugins/Makefile.am b/src/post/deinterlace/plugins/Makefile.am index 5e50c25f4..b2068a019 100644 --- a/src/post/deinterlace/plugins/Makefile.am +++ b/src/post/deinterlace/plugins/Makefile.am @@ -1,3 +1,4 @@ +include $(top_srcdir)/misc/Makefile.quiet include $(top_srcdir)/misc/Makefile.common # plugins/Makefile.am distributes the plugins that come with tvtime. 
@@ -17,7 +18,12 @@ include $(top_srcdir)/misc/Makefile.common # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -EXTRA_DIST = greedy2frame_template.c greedyh.asm \ +# libpostproc is here so we can use their nice mangle.h +AM_CFLAGS = $(VISIBILITY_FLAG) +AM_CPPFLAGS = -I$(top_srcdir)/src/post/deinterlace \ + -I$(top_srcdir)/src/xine-utils + +EXTRA_DIST = greedy2frame_template.c greedy2frame_template_sse2.c greedyh.asm \ tomsmocomp/SearchLoop0A.inc tomsmocomp/SearchLoopBottom.inc \ tomsmocomp/SearchLoopEdgeA.inc tomsmocomp/SearchLoopEdgeA8.inc \ tomsmocomp/SearchLoopOddA.inc tomsmocomp/SearchLoopOddA2.inc \ @@ -28,27 +34,26 @@ EXTRA_DIST = greedy2frame_template.c greedyh.asm \ tomsmocomp/TomsMoCompAll2.inc tomsmocomp/WierdBob.inc \ tomsmocomp/tomsmocompmacros.h x86-64_macros.inc -# libpostproc is here so we can use their nice mangle.h -AM_CFLAGS = -I$(top_srcdir)/src/post/deinterlace \ - -I$(top_srcdir)/src/xine-utils - -# Avoid "can't find register" failures with -O0, -O2, -O3 (gcc 4.0) -libdeinterlaceplugins_la-kdetv_greedyh.o libdeinterlaceplugins_la-kdetv_greedyh.lo: CFLAGS=$(shell echo @CFLAGS@ | sed -e 's/$$/ -O1/') - -noinst_LTLIBRARIES = libdeinterlaceplugins.la - -libdeinterlaceplugins_la_SOURCES = \ - double.c \ - greedy.c \ - linear.c \ - linearblend.c \ - vfir.c \ - weave.c \ - greedy2frame.c \ - scalerbob.c \ - kdetv_greedyh.c \ - kdetv_tomsmocomp.c -libdeinterlaceplugins_la_CFLAGS = $(VISIBILITY_FLAG) $(AM_CFLAGS) -libdeinterlaceplugins_la_LDFLAGS = $(xineplug_ldflags) - noinst_HEADERS = plugins.h greedyhmacros.h + +if DEBUG_BUILD +debug_sources = greedy2frame.c +nodebug_sources = +else +debug_sources = +nodebug_sources = greedy2frame.c +endif + +# per-object CFLAGS -- drop optimization on kdetv_greedyh.c so that gcc +# doesn't run out of general registers trying to compile it. 
+ +noinst_LTLIBRARIES = libdeinterlacepluginsO1.la libdeinterlaceplugins.la +libdeinterlacepluginsO1_la_SOURCES = kdetv_greedyh.c $(debug_sources) +libdeinterlacepluginsO1_la_CFLAGS = $(O1_CFLAGS) $(AM_CFLAGS) + +libdeinterlaceplugins_la_SOURCES = double.c greedy.c linear.c linearblend.c \ + vfir.c weave.c scalerbob.c kdetv_tomsmocomp.c \ + $(nodebug_sources) +libdeinterlaceplugins_la_LIBADD = $(XINE_LIB) libdeinterlacepluginsO1.la +libdeinterlaceplugins_la_CFLAGS = $(DEFAULT_OCFLAGS) $(AM_CFLAGS) $(AVUTIL_CFLAGS) +libdeinterlaceplugins_la_LDFLAGS = $(AM_LDFLAGS) $(xineplug_ldflags) diff --git a/src/post/deinterlace/plugins/greedy.c b/src/post/deinterlace/plugins/greedy.c index 925779224..ee401dba6 100644 --- a/src/post/deinterlace/plugins/greedy.c +++ b/src/post/deinterlace/plugins/greedy.c @@ -32,8 +32,9 @@ #include <stdint.h> #endif -#include "attributes.h" -#include "xineutils.h" +#include <xine/attributes.h> +#include <xine/xineutils.h> +#include "xine_mmx.h" #include "deinterlace.h" #include "speedtools.h" #include "speedy.h" diff --git a/src/post/deinterlace/plugins/greedy2frame.c b/src/post/deinterlace/plugins/greedy2frame.c index 57e3228ac..964c490a9 100644 --- a/src/post/deinterlace/plugins/greedy2frame.c +++ b/src/post/deinterlace/plugins/greedy2frame.c @@ -31,8 +31,9 @@ #include <stdint.h> #endif -#include "attributes.h" -#include "xineutils.h" +#include <xine/attributes.h> +#include <xine/xineutils.h> +#include "xine_mmx.h" #include "deinterlace.h" #include "speedtools.h" #include "speedy.h" @@ -44,12 +45,45 @@ // uncomment next line to see this //#define CHECK_BOBWEAVE -static int GreedyTwoFrameThreshold = 4; -static int GreedyTwoFrameThreshold2 = 8; +#define GREEDYTWOFRAMETHRESHOLD 4 +#define GREEDYTWOFRAMETHRESHOLD2 8 -#define IS_SSE 1 +#define IS_MMXEXT 1 #include "greedy2frame_template.c" -#undef IS_SSE +#undef IS_MMXEXT + +#include "greedy2frame_template_sse2.c" + +static void DeinterlaceGreedy2Frame(uint8_t *output, int outstride, + 
deinterlace_frame_data_t *data, + int bottom_field, int second_field, int width, int height ) + +{ + if (xine_mm_accel() & MM_ACCEL_X86_SSE2) { + if (((uintptr_t)output & 15) || (outstride & 15) || + width & 7 || + ((uintptr_t)data->f0 & 15) || ((uintptr_t)data->f1 & 15)) { + /* + * instead of using an unaligned sse2 version just fall back to mmx + * which has no alignment restriction (though might be slow unaligned, + * but shouldn't hit this hopefully anyway). Plus in my experiments this + * was at least as fast as a naive unaligned sse2 version anyway (due to + * the inability to use streaming stores). + */ + DeinterlaceGreedy2Frame_MMXEXT(output, outstride, data, + bottom_field, second_field, width, height ); + } else { + DeinterlaceGreedy2Frame_SSE2(output, outstride, data, + bottom_field, second_field, width, height ); + } + } + else { + DeinterlaceGreedy2Frame_MMXEXT(output, outstride, data, + bottom_field, second_field, width, height ); + /* could fall back to 3dnow/mmx here too */ + } +} + static deinterlace_method_t greedy2framemethod = { @@ -61,7 +95,7 @@ static deinterlace_method_t greedy2framemethod = 0, 0, 0, - DeinterlaceGreedy2Frame_SSE, + DeinterlaceGreedy2Frame, 1, NULL }; diff --git a/src/post/deinterlace/plugins/greedy2frame_template.c b/src/post/deinterlace/plugins/greedy2frame_template.c index 7fe52519f..e88124886 100644 --- a/src/post/deinterlace/plugins/greedy2frame_template.c +++ b/src/post/deinterlace/plugins/greedy2frame_template.c @@ -85,25 +85,18 @@ */ -/* debugging feature */ -/* output the value of mm4 at this point which is pink where we will weave */ -/* and green were we are going to bob */ -/* uncomment next line to see this */ -/* #define CHECK_BOBWEAVE */ - +#if defined(ARCH_X86) || defined(ARCH_X86_64) #if !defined(MASKS_DEFINED) #define MASKS_DEFINED - static const int64_t __attribute__((__used__)) YMask = 0x00ff00ff00ff00ffll; - static const int64_t __attribute__((__used__)) Mask = 0x7f7f7f7f7f7f7f7fll; - static const int64_t 
__attribute__((__used__)) DwordOne = 0x0000000100000001ll; - static const int64_t __attribute__((__used__)) DwordTwo = 0x0000000200000002ll; - static int64_t qwGreedyTwoFrameThreshold; +static const mmx_t Mask = { uq: 0x7f7f7f7f7f7f7f7fll }; +#define TP GREEDYTWOFRAMETHRESHOLD, GREEDYTWOFRAMETHRESHOLD2 +static const mmx_t GreedyTwoFrameThreshold = { ub: {TP, TP, TP, TP} }; +#undef TP +#endif #endif -#include <mangle.h> - -#if defined(IS_SSE) -static void DeinterlaceGreedy2Frame_SSE(uint8_t *output, int outstride, +#if defined(IS_MMXEXT) +static void DeinterlaceGreedy2Frame_MMXEXT(uint8_t *output, int outstride, deinterlace_frame_data_t *data, int bottom_field, int second_field, int width, int height ) #elif defined(IS_3DNOW) @@ -132,13 +125,6 @@ static void DeinterlaceGreedy2Frame_MMX(uint8_t *output, int outstride, uint32_t LineLength = stride; uint32_t PitchRest = Pitch - (LineLength >> 3)*8; - qwGreedyTwoFrameThreshold = GreedyTwoFrameThreshold; - qwGreedyTwoFrameThreshold += (GreedyTwoFrameThreshold2 << 8); - qwGreedyTwoFrameThreshold += (qwGreedyTwoFrameThreshold << 48) + - (qwGreedyTwoFrameThreshold << 32) + - (qwGreedyTwoFrameThreshold << 16); - - if( second_field ) { M1 = data->f0; T1 = data->f0; @@ -185,14 +171,15 @@ static void DeinterlaceGreedy2Frame_MMX(uint8_t *output, int outstride, asm volatile( /* Figure out what to do with the scanline above the one we just copied. * See above for a description of the algorithm. 
- */ - ".align 8 \n\t" - "movq %4, %%mm6 \n\t" - + * weave if (weave(M) AND (weave(T) OR weave(B))) + */ "movq %0, %%mm1 \n\t" // T1 "movq %1, %%mm0 \n\t" // M1 "movq %2, %%mm3 \n\t" // B1 "movq %3, %%mm2 \n\t" // M0 + + "movq %4, %%mm6 \n\t" // Mask + : /* no output */ : "m" (*T1), "m" (*M1), "m" (*B1), "m" (*M0), "m" (Mask) ); @@ -205,7 +192,7 @@ static void DeinterlaceGreedy2Frame_MMX(uint8_t *output, int outstride, */ "movq %%mm3, %%mm7 \n\t" /* mm7 = B1 */ -#if defined(IS_SSE) +#if defined(IS_MMXEXT) "pavgb %%mm1, %%mm7 \n\t" #elif defined(IS_3DNOW) "pavgusb %%mm1, %%mm7 \n\t" @@ -224,93 +211,84 @@ static void DeinterlaceGreedy2Frame_MMX(uint8_t *output, int outstride, * which should make weave look better when there is small amounts of * movement */ -#if defined(IS_SSE) - "movq %%mm0, %%mm4 \n\t" - "movq %%mm2, %%mm5 \n\t" - "psubusb %%mm2, %%mm4 \n\t" - "psubusb %%mm0, %%mm5 \n\t" - "por %%mm5, %%mm4 \n\t" - "psrlw $1, %%mm4 \n\t" - "pavgb %%mm2, %%mm0 \n\t" - "pand %%mm6, %%mm4 \n\t" +#if defined(IS_MMXEXT) + "movq %%mm0, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "psubusb %%mm2, %%mm4 \n\t" + "psubusb %%mm0, %%mm5 \n\t" + "por %%mm5, %%mm4 \n\t" + "pavgb %%mm2, %%mm0 \n\t" #elif defined(IS_3DNOW) - "movq %%mm0, %%mm4 \n\t" - "movq %%mm2, %%mm5 \n\t" - "psubusb %%mm2, %%mm4 \n\t" - "psubusb %%mm0, %%mm5 \n\t" - "por %%mm5, %%mm4 \n\t" - "psrlw $1, %%mm4 \n\t" - "pavgusb %%mm2, %%mm0 \n\t" - "pand %%mm6, %%mm4 \n\t" + "movq %%mm0, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "psubusb %%mm2, %%mm4 \n\t" + "psubusb %%mm0, %%mm5 \n\t" + "por %%mm5, %%mm4 \n\t" + "pavgusb %%mm2, %%mm0 \n\t" #else - "movq %%mm0, %%mm4 \n\t" - "psubusb %%mm2, %%mm4 \n\t" - "psubusb %%mm0, %%mm2 \n\t" - "por %%mm2, %%mm4 \n\t" - "psrlw $1, %%mm4 \n\t" - "pand %%mm6, %%mm4 \n\t" + "movq %%mm0, %%mm4 \n\t" + "psubusb %%mm2, %%mm4 \n\t" + "psubusb %%mm0, %%mm2 \n\t" + "por %%mm2, %%mm4 \n\t" #endif - /* if |M1-M0| > Threshold we want dword worth of twos */ - "pcmpgtb %3, %%mm4 \n\t" - "pand 
%4, %%mm4 \n\t" /* get rid of sign bit */ - "pcmpgtd %5, %%mm4 \n\t" /* do we want to bob */ - "pandn %6, %%mm4 \n\t" - "movq %1, %%mm2 \n\t" /* mm2 = T0 */ - /* calculate |T1-T0| put result in mm5 */ - "movq %%mm2, %%mm5 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "psubusb %%mm2, %%mm1 \n\t" - "por %%mm1, %%mm5 \n\t" - "psrlw $1, %%mm5 \n\t" - "pand %%mm6, %%mm5 \n\t" - - /* if |T1-T0| > Threshold we want dword worth of ones */ - "pcmpgtb %3, %%mm5 \n\t" - "pand %%mm6, %%mm5 \n\t" /* get rid of sign bit */ + /* if |M1-M0| > Threshold we want 0 else dword minus one */ + "psrlw $1, %%mm4 \n\t" + "pand %%mm6, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // zero + "pcmpgtb %3, %%mm4 \n\t" + "pcmpeqd %%mm5, %%mm4 \n\t" /* do we want to bob */ - "pcmpgtd %5, %%mm5 \n\t" - "pandn %5, %%mm5 \n\t" - "paddd %%mm5, %%mm4 \n\t" + /* calculate |T1-T0| put result in mm5 */ + "movq %%mm2, %%mm5 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "psubusb %%mm2, %%mm1 \n\t" + "por %%mm1, %%mm5 \n\t" - "movq %2, %%mm2 \n\t" /* B0 */ + "movq %2, %%mm2 \n\t" /* mm2 = B0 */ - /* calculate |B1-B0| put result in mm5 */ - "movq %%mm2, %%mm5 \n\t" - "psubusb %%mm3, %%mm5 \n\t" - "psubusb %%mm2, %%mm3 \n\t" - "por %%mm3, %%mm5 \n\t" + /* if |T1-T0| > Threshold we want 0 else dword minus one */ "psrlw $1, %%mm5 \n\t" - "pand %%mm6, %%mm5 \n\t" + "pand %%mm6, %%mm5 \n\t" + "pxor %%mm1, %%mm1 \n\t" // zero + "pcmpgtb %3, %%mm5 \n\t" + "pcmpeqd %%mm1, %%mm5 \n\t" + + /* calculate |B1-B0| put result in mm1 */ + "movq %%mm2, %%mm1 \n\t" + "psubusb %%mm3, %%mm1 \n\t" + "psubusb %%mm2, %%mm3 \n\t" + "por %%mm3, %%mm1 \n\t" - /* if |B1-B0| > Threshold we want dword worth of ones */ - "pcmpgtb %3, %%mm5 \n\t" - "pand %%mm6, %%mm5 \n\t" /* get rid of any sign bit */ - "pcmpgtd %5, %%mm5 \n\t" - "pandn %5, %%mm5 \n\t" - "paddd %%mm5, %%mm4 \n\t" + /* if |B1-B0| > Threshold we want 0 else dword minus one */ + "psrlw $1, %%mm1 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pxor %%mm3, %%mm3 \n\t" // zero + "pcmpgtb %3, %%mm1 \n\t" + 
"pcmpeqd %%mm3, %%mm1 \n\t" - "pcmpgtd %6, %%mm4 \n\t" + "por %%mm1, %%mm5 \n\t" + "pand %%mm5, %%mm4 \n\t" /* debugging feature * output the value of mm4 at this point which is pink where we will weave - * and green were we are going to bob */ + * and green where we are going to bob + */ #ifdef CHECK_BOBWEAVE -#ifdef IS_SSE +#ifdef IS_MMXEXT "movntq %%mm4, %0 \n\t" #else "movq %%mm4, %0 \n\t" #endif #else - "movq %%mm4, %%mm5 \n\t" - /* mm4 now is 1 where we want to weave and 0 where we want to bob */ - "pand %%mm0, %%mm4 \n\t" - "pandn %%mm7, %%mm5 \n\t" - "por %%mm5, %%mm4 \n\t" -#ifdef IS_SSE + /* mm4 now is 1 where we want to weave and 0 where we want to bob */ + "pand %%mm4, %%mm0 \n\t" + "pandn %%mm7, %%mm4 \n\t" + "por %%mm0, %%mm4 \n\t" +#ifdef IS_MMXEXT "movntq %%mm4, %0 \n\t" #else "movq %%mm4, %0 \n\t" @@ -318,7 +296,7 @@ static void DeinterlaceGreedy2Frame_MMX(uint8_t *output, int outstride, #endif : "=m" (*Dest2) - : "m" (*T0), "m" (*B0), "m" (qwGreedyTwoFrameThreshold), "m" (Mask), "m" (DwordOne), "m" (DwordTwo) ); + : "m" (*T0), "m" (*B0), "m" (GreedyTwoFrameThreshold) ); /* Advance to the next set of pixels. */ T1 += 8; @@ -341,7 +319,7 @@ static void DeinterlaceGreedy2Frame_MMX(uint8_t *output, int outstride, B0 += PitchRest; } -#ifdef IS_SSE +#ifdef IS_MMXEXT asm("sfence\n\t"); #endif diff --git a/src/post/deinterlace/plugins/greedy2frame_template_sse2.c b/src/post/deinterlace/plugins/greedy2frame_template_sse2.c new file mode 100644 index 000000000..379a78440 --- /dev/null +++ b/src/post/deinterlace/plugins/greedy2frame_template_sse2.c @@ -0,0 +1,293 @@ +/***************************************************************************** +** Copyright (c) 2000 John Adcock, Tom Barry, Steve Grimm All rights reserved. 
+** port copyright (c) 2003 Miguel Freitas +****************************************************************************** +** +** This file is subject to the terms of the GNU General Public License as +** published by the Free Software Foundation. A copy of this license is +** included with this software distribution in the file COPYING. If you +** do not have a copy, you may obtain a copy by writing to the Free +** Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. +** +** This software is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details +****************************************************************************** +** CVS Log +** +** Revision 1.10 2006/12/21 09:54:45 dgp85 +** Apply the textrel patch from Gentoo, thanks to PaX team for providing it. The patch was applied and tested for a while in Gentoo and Pardus, and solves also Debian's problems with non-PIC code. If problems will arise, they'll be debugged. +** +** Revision 1.9 2006/02/04 14:06:29 miguelfreitas +** Enable AMD64 mmx/sse support in some plugins (tvtime, libmpeg2, goom...) 
+** patch by dani3l +** +** Revision 1.8 2005/06/05 16:00:06 miguelfreitas +** quite some hacks for gcc 2.95 compatibility +** +** Revision 1.7 2004/04/09 02:57:06 miguelfreitas +** tvtime deinterlacing algorithms assumed top_field_first=1 +** top_field_first=0 (aka bottom_field_first) should now work as expected +** +** Revision 1.6 2004/02/12 20:53:31 mroi +** my gcc (partly 3.4 already) optimizes these away, because they are only used +** inside inline assembler (which the compiler does not recognize); so actually +** the code is wrong (the asm parts should list these as inputs), but telling +** the compiler to keep them is the easier fix +** +** Revision 1.5 2004/01/05 12:15:55 siggi +** wonder why Mike isn't complaining about C++ style comments, any more... +** +** Revision 1.4 2004/01/05 01:47:26 tmmm +** DOS/Win CRs are forbidden, verboten, interdit +** +** Revision 1.3 2004/01/02 20:53:43 miguelfreitas +** better MANGLE from ffmpeg +** +** Revision 1.2 2004/01/02 20:47:03 miguelfreitas +** my small contribution to the cygwin port ;-) +** +** Revision 1.1 2003/06/22 17:30:03 miguelfreitas +** use our own port of greedy2frame (tvtime port is currently broken) +** +** Revision 1.8 2001/11/23 17:18:54 adcockj +** Fixed silly and/or confusion +** +** Revision 1.7 2001/11/22 22:27:00 adcockj +** Bug Fixes +** +** Revision 1.6 2001/11/21 15:21:40 adcockj +** Renamed DEINTERLACE_INFO to TDeinterlaceInfo in line with standards +** Changed TDeinterlaceInfo structure to have history of pictures. +** +** Revision 1.5 2001/07/31 06:48:33 adcockj +** Fixed index bug spotted by Peter Gubanov +** +** Revision 1.4 2001/07/13 16:13:33 adcockj +** Added CVS tags and removed tabs +** +*****************************************************************************/ + +/* + * This is the implementation of the Greedy 2-frame deinterlace algorithm + * described in DI_Greedy2Frame.c. 
It's in a separate file so we can compile + * variants for different CPU types; most of the code is the same in the + * different variants. + */ + + +/**************************************************************************** +** Field 1 | Field 2 | Field 3 | Field 4 | +** T0 | | T1 | | +** | M0 | | M1 | +** B0 | | B1 | | +*/ + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static const sse_t Mask128 = { uq: { 0x7f7f7f7f7f7f7f7fll, 0x7f7f7f7f7f7f7f7fll} }; +#define TP GREEDYTWOFRAMETHRESHOLD, GREEDYTWOFRAMETHRESHOLD2 +static const sse_t GreedyTwoFrameThreshold128 = { ub: {TP, TP, TP, TP, TP, TP, TP, TP} }; +#undef TP +#endif + +static void DeinterlaceGreedy2Frame_SSE2(uint8_t *output, int outstride, + deinterlace_frame_data_t *data, + int bottom_field, int second_field, + int width, int height ) +{ +#if defined(ARCH_X86) || defined(ARCH_X86_64) + int Line; + int stride = width * 2; + register uint8_t* M1; + register uint8_t* M0; + register uint8_t* T1; + register uint8_t* T0; + uint8_t* Dest = output; + register uint8_t* Dest2; + register uint8_t* Destc; + register int count; + uint32_t Pitch = stride * 2; + uint32_t LineLength = stride; + uint32_t PitchRest = Pitch - (LineLength >> 4)*16; + + if( second_field ) { + M1 = data->f0; + T1 = data->f0; + M0 = data->f1; + T0 = data->f1; + } else { + M1 = data->f0; + T1 = data->f1; + M0 = data->f1; + T0 = data->f2; + } + + if( bottom_field ) { + M1 += stride; + T1 += 0; + M0 += stride; + T0 += 0; + } else { + M1 += Pitch; + T1 += stride; + M0 += Pitch; + T0 += stride; + + xine_fast_memcpy(Dest, M1, LineLength); + Dest += outstride; + } + + for (Line = 0; Line < (height / 2) - 1; ++Line) + { + /* Always use the most recent data verbatim. By definition it's correct + * (it'd be shown on an interlaced display) and our job is to fill in + * the spaces between the new lines. + */ + /* xine_fast_memcpy would be pretty pointless here as we load the same + * data anyway it's just one additional mov per loop... 
+ * XXX I believe some cpus with sse2 (early A64?) only have one write + * buffer. Using movntdq with 2 different streams may have quite + * bad performance consequences on such cpus. + */ + + Destc = Dest; + Dest += outstride; + Dest2 = Dest; + + /* just rely on gcc not using xmm regs... */ + do { + asm volatile( + "movdqa %0, %%xmm6 \n\t" // xmm6 = Mask + "pxor %%xmm7, %%xmm7 \n\t" // xmm7 = zero + : /* no output */ + : "m" (Mask128) ); + } while (0); + + count = LineLength >> 4; + do { + asm volatile( + /* Figure out what to do with the scanline above the one we copy. + * See above for a description of the algorithm. + * weave if (weave(M) AND (weave(T) OR weave(B))) + */ + "movdqa (%2), %%xmm1 \n\t" /* xmm1 = T1 */ + "movdqa (%3), %%xmm0 \n\t" /* xmm0 = T0 */ + "movdqa (%q4,%2), %%xmm3 \n\t" /* xmm3 = B1 */ + "movdqa (%q4,%3), %%xmm2 \n\t" /* xmm2 = B0 */ + + /* calculate |T1-T0| keep T1 put result in xmm5 */ + "movdqa %%xmm1, %%xmm5 \n\t" + "psubusb %%xmm0, %%xmm5 \n\t" + "psubusb %%xmm1, %%xmm0 \n\t" + "por %%xmm0, %%xmm5 \n\t" + + /* T1 is data for line to copy */ + "movntdq %%xmm1, %1 \n\t" + + /* if |T1-T0| > Threshold we want 0 else dword minus one */ + "psrlw $1, %%xmm5 \n\t" + "pand %%xmm6, %%xmm5 \n\t" + "pcmpgtb %0, %%xmm5 \n\t" + "pcmpeqd %%xmm7, %%xmm5 \n\t" + + "prefetcht0 64(%q4,%2) \n\t" + "prefetcht0 64(%q4,%3) \n\t" + : + : "m" (GreedyTwoFrameThreshold128), + "m" (*Destc), "r" (T1), "r" (T0), "r" (Pitch) ); + + asm volatile ( + /* calculate |B1-B0| keep B1 put result in xmm4 */ + "movdqa %%xmm3, %%xmm4 \n\t" + "psubusb %%xmm2, %%xmm4 \n\t" + "psubusb %%xmm3, %%xmm2 \n\t" + "por %%xmm2, %%xmm4 \n\t" + + "movdqa (%0), %%xmm0 \n\t" /* xmm0 = M1 */ + "movdqa (%1), %%xmm2 \n\t" /* xmm2 = M0 */ + + /* if |B1-B0| > Threshold we want 0 else dword minus one */ + "psrlw $1, %%xmm4 \n\t" + "pand %%xmm6, %%xmm4 \n\t" + "pcmpgtb %2, %%xmm4 \n\t" + "pcmpeqd %%xmm7, %%xmm4 \n\t" + + "por %%xmm4, %%xmm5 \n\t" + + /* Average T1 and B1 so we can do interpolated 
bobbing if we bob + * onto T1 */ + "pavgb %%xmm3, %%xmm1 \n\t" /* xmm1 = avg(T1,B1) */ + + "prefetcht0 64(%0) \n\t" + "prefetcht0 64(%1) \n\t" + + /* make xmm3 the average of M1 and M0 which should make weave + * look better when there are small amounts of movement */ + "movdqa %%xmm2, %%xmm3 \n\t" + "pavgb %%xmm0, %%xmm3 \n\t" /* xmm3 = avg(M1,M0) */ + + /* calculate |M1-M0| put result in xmm4 */ + "movdqa %%xmm0, %%xmm4 \n\t" + "psubusb %%xmm2, %%xmm4 \n\t" + "psubusb %%xmm0, %%xmm2 \n\t" + "por %%xmm2, %%xmm4 \n\t" + + /* if |M1-M0| > Threshold we want 0 else dword minus one */ + "psrlw $1, %%xmm4 \n\t" + "pand %%xmm6, %%xmm4 \n\t" + "pcmpgtb %2, %%xmm4 \n\t" + "pcmpeqd %%xmm7, %%xmm4 \n\t" /* do we want to bob */ + + "pand %%xmm5, %%xmm4 \n\t" + +/* debugging feature + * output the value of xmm4 at this point which is pink where we will weave + * and green where we are going to bob + */ +#ifdef CHECK_BOBWEAVE + "movntdq %%xmm4, %3 \n\t" +#else + /* xmm4 now is 1 where we want to weave and 0 where we want to bob */ + "pand %%xmm4, %%xmm3 \n\t" + "pandn %%xmm1, %%xmm4 \n\t" + "por %%xmm3, %%xmm4 \n\t" + "movntdq %%xmm4, %3 \n\t" +#endif + : + : "r" (M1), "r" (M0), "m" (GreedyTwoFrameThreshold128), + "m" (*Dest2)); + + /* Advance to the next set of pixels. 
*/ + T1 += 16; + M1 += 16; + M0 += 16; + T0 += 16; + Dest2 += 16; + Destc += 16; + + } while( --count ); + + Dest += outstride; + + M1 += PitchRest; + T1 += PitchRest; + M0 += PitchRest; + T0 += PitchRest; + } + + asm("sfence\n\t"); + + if( bottom_field ) + { + xine_fast_memcpy(Dest, T1, stride); + Dest += outstride; + xine_fast_memcpy(Dest, M1, stride); + } + else + { + xine_fast_memcpy(Dest, T1, stride); + } +#endif +} + diff --git a/src/post/deinterlace/plugins/greedyh.asm b/src/post/deinterlace/plugins/greedyh.asm index 11b28ca76..c96bfbf2c 100644 --- a/src/post/deinterlace/plugins/greedyh.asm +++ b/src/post/deinterlace/plugins/greedyh.asm @@ -17,7 +17,6 @@ ///////////////////////////////////////////////////////////////////////////// #include "x86-64_macros.inc" -#include <mangle.h> #if !defined(MASKS_DEFINED) #define MASKS_DEFINED diff --git a/src/post/deinterlace/plugins/kdetv_greedyh.c b/src/post/deinterlace/plugins/kdetv_greedyh.c index 5ec48e4a2..2207772ca 100644 --- a/src/post/deinterlace/plugins/kdetv_greedyh.c +++ b/src/post/deinterlace/plugins/kdetv_greedyh.c @@ -31,8 +31,8 @@ #include <stdint.h> #endif -#include "attributes.h" -#include "xineutils.h" +#include <xine/attributes.h> +#include <xine/xineutils.h> #include "deinterlace.h" #include "speedtools.h" #include "speedy.h" diff --git a/src/post/deinterlace/plugins/kdetv_tomsmocomp.c b/src/post/deinterlace/plugins/kdetv_tomsmocomp.c index ae0fa0363..0f87b913f 100644 --- a/src/post/deinterlace/plugins/kdetv_tomsmocomp.c +++ b/src/post/deinterlace/plugins/kdetv_tomsmocomp.c @@ -31,8 +31,8 @@ #include <stdint.h> #endif -#include "attributes.h" -#include "xineutils.h" +#include <xine/attributes.h> +#include <xine/xineutils.h> #include "deinterlace.h" #include "speedtools.h" #include "speedy.h" diff --git a/src/post/deinterlace/plugins/linearblend.c b/src/post/deinterlace/plugins/linearblend.c index c594f41dd..fe230685b 100644 --- a/src/post/deinterlace/plugins/linearblend.c +++ 
b/src/post/deinterlace/plugins/linearblend.c @@ -31,8 +31,9 @@ #include <stdint.h> #endif -#include "attributes.h" -#include "xineutils.h" +#include <xine/attributes.h> +#include <xine/xineutils.h> +#include "xine_mmx.h" #include "speedtools.h" #include "speedy.h" #include "deinterlace.h" diff --git a/src/post/deinterlace/plugins/tomsmocomp/TomsMoCompAll.inc b/src/post/deinterlace/plugins/tomsmocomp/TomsMoCompAll.inc index a3b139691..d3ee46a20 100644 --- a/src/post/deinterlace/plugins/tomsmocomp/TomsMoCompAll.inc +++ b/src/post/deinterlace/plugins/tomsmocomp/TomsMoCompAll.inc @@ -21,8 +21,6 @@ // See www.eff.org for details ///////////////////////////////////////////////////////////////////////////// -#include <mangle.h> - #if !defined(MASKS_DEFINED) #define MASKS_DEFINED static const int64_t __attribute__((__used__)) Max_Mov = 0x0404040404040404ull; diff --git a/src/post/deinterlace/plugins/vfir.c b/src/post/deinterlace/plugins/vfir.c index e66d7c789..89ea1d0e5 100644 --- a/src/post/deinterlace/plugins/vfir.c +++ b/src/post/deinterlace/plugins/vfir.c @@ -34,8 +34,9 @@ #include <stdint.h> #endif -#include "attributes.h" -#include "xineutils.h" +#include <xine/attributes.h> +#include <xine/xineutils.h> +#include "xine_mmx.h" #include "speedy.h" #include "deinterlace.h" #include "plugins.h" diff --git a/src/post/deinterlace/speedy.c b/src/post/deinterlace/speedy.c index 32c8b03e3..4c9d5c0d1 100644 --- a/src/post/deinterlace/speedy.c +++ b/src/post/deinterlace/speedy.c @@ -62,8 +62,9 @@ #include <stdint.h> #endif -#include "attributes.h" -#include "xineutils.h" +#include <xine/attributes.h> +#include <xine/xineutils.h> +#include "xine_mmx.h" #include "speedtools.h" #include "speedy.h" @@ -343,6 +344,89 @@ static unsigned int diff_factor_packed422_scanline_mmx( uint8_t *cur, uint8_t *o } #endif +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +static const sse_t dqwYMask = { uq: { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }}; +static const sse_t dqwCMask = { uq: { 
0xff00ff00ff00ff00ULL, 0xff00ff00ff00ff00ULL }}; + +static unsigned int diff_factor_packed422_scanline_sse2_aligned( uint8_t *cur, uint8_t *old, int width ) +{ + register unsigned int temp; + + width /= 8; + + movdqa_m2r( dqwYMask, xmm1 ); + movd_m2r( BitShift, xmm7 ); + pxor_r2r( xmm0, xmm0 ); + + while( width-- ) { + movdqa_m2r( *cur, xmm4 ); + movdqa_m2r( *old, xmm5 ); + + pand_r2r( xmm1, xmm4 ); + pand_r2r( xmm1, xmm5 ); + + psubw_r2r( xmm5, xmm4 ); /* mm4 = Y1 - Y2 */ + pmaddwd_r2r( xmm4, xmm4 ); /* mm4 = (Y1 - Y2)^2 */ + psrld_r2r( xmm7, xmm4 ); /* divide mm4 by 2^BitShift */ + paddd_r2r( xmm4, xmm0 ); /* keep total in mm0 */ + + cur += 16; + old += 16; + } + + pshufd_r2r(xmm0, xmm1, 0x0e); + paddd_r2r(xmm1, xmm0); + pshufd_r2r(xmm0, xmm1, 0x01); + paddd_r2r(xmm1, xmm0); + + movd_r2a(xmm0, temp); + return temp; +} +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static unsigned int diff_factor_packed422_scanline_sse2( uint8_t *cur, uint8_t *old, int width ) +{ + if (0 == (((unsigned int)cur|(unsigned int)old) & 15)) { + return diff_factor_packed422_scanline_sse2_aligned(cur, old, width); + } + + register unsigned int temp; + + width /= 8; + + movdqa_m2r( dqwYMask, xmm1 ); + movd_m2r( BitShift, xmm7 ); + pxor_r2r( xmm0, xmm0 ); + + while( width-- ) { + movdqu_m2r( *cur, xmm4 ); + movdqu_m2r( *old, xmm5 ); + + pand_r2r( xmm1, xmm4 ); + pand_r2r( xmm1, xmm5 ); + + psubw_r2r( xmm5, xmm4 ); /* mm4 = Y1 - Y2 */ + pmaddwd_r2r( xmm4, xmm4 ); /* mm4 = (Y1 - Y2)^2 */ + psrld_r2r( xmm7, xmm4 ); /* divide mm4 by 2^BitShift */ + paddd_r2r( xmm4, xmm0 ); /* keep total in mm0 */ + + cur += 16; + old += 16; + } + + pshufd_r2r(xmm0, xmm1, 0x0e); + paddd_r2r(xmm1, xmm0); + pshufd_r2r(xmm0, xmm1, 0x01); + paddd_r2r(xmm1, xmm0); + + movd_r2a(xmm0, temp); + + return temp; +} +#endif + #define ABS(a) (((a) < 0)?-(a):(a)) #if defined(ARCH_X86) || defined(ARCH_X86_64) @@ -716,6 +800,130 @@ static void vfilter_chroma_332_packed422_scanline_mmx( uint8_t *output, int widt } 
#endif +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static void vfilter_chroma_332_packed422_scanline_sse2_aligned( uint8_t *output, int width, + uint8_t *m, uint8_t *t, uint8_t *b ) +{ + int i; + + // Get width in bytes. + width *= 2; + i = width / 16; + width -= i * 16; + + movdqa_m2r( dqwYMask, xmm7 ); + movdqa_m2r( dqwCMask, xmm6 ); + + while( i-- ) { + movdqa_m2r ( *t, xmm0 ); + movdqa_m2r ( *b, xmm1 ); + movdqa_m2r ( *m, xmm2 ); + + movdqa_r2r ( xmm2, xmm3 ); + pand_r2r ( xmm7, xmm3 ); + + pand_r2r ( xmm6, xmm0 ); + pand_r2r ( xmm6, xmm1 ); + pand_r2r ( xmm6, xmm2 ); + + psrlq_i2r ( 8, xmm0 ); + psrlq_i2r ( 7, xmm1 ); + psrlq_i2r ( 8, xmm2 ); + + movdqa_r2r ( xmm0, xmm4 ); + movdqa_r2r ( xmm2, xmm5 ); + psllw_i2r ( 1, xmm4 ); + psllw_i2r ( 1, xmm5 ); + paddw_r2r ( xmm4, xmm0 ); + paddw_r2r ( xmm5, xmm2 ); + + paddw_r2r ( xmm0, xmm2 ); + paddw_r2r ( xmm1, xmm2 ); + + psllw_i2r ( 5, xmm2 ); + pand_r2r ( xmm6, xmm2 ); + + por_r2r ( xmm3, xmm2 ); + + movdqa_r2m( xmm2, *output ); + output += 16; + t += 16; + b += 16; + m += 16; + } + output++; t++; b++; m++; + while( width-- ) { + *output = (3 * *t + 3 * *m + 2 * *b) >> 3; + output +=2; t+=2; b+=2; m+=2; + } +} +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static void vfilter_chroma_332_packed422_scanline_sse2( uint8_t *output, int width, + uint8_t *m, uint8_t *t, uint8_t *b ) +{ + int i; + + if (0 == (((unsigned int)output|(unsigned int)m|(unsigned int)t|(unsigned int)b) & 15)) { + vfilter_chroma_332_packed422_scanline_sse2_aligned(output, width, m, t, b); + return; + } + + // Get width in bytes. 
+ width *= 2; + i = width / 16; + width -= i * 16; + + movdqa_m2r( dqwYMask, xmm7 ); + movdqa_m2r( dqwCMask, xmm6 ); + + while( i-- ) { + movdqu_m2r ( *t, xmm0 ); + movdqu_m2r ( *b, xmm1 ); + movdqu_m2r ( *m, xmm2 ); + + movdqa_r2r ( xmm2, xmm3 ); + pand_r2r ( xmm7, xmm3 ); + + pand_r2r ( xmm6, xmm0 ); + pand_r2r ( xmm6, xmm1 ); + pand_r2r ( xmm6, xmm2 ); + + psrlq_i2r ( 8, xmm0 ); + psrlq_i2r ( 7, xmm1 ); + psrlq_i2r ( 8, xmm2 ); + + movdqa_r2r ( xmm0, xmm4 ); + movdqa_r2r ( xmm2, xmm5 ); + psllw_i2r ( 1, xmm4 ); + psllw_i2r ( 1, xmm5 ); + paddw_r2r ( xmm4, xmm0 ); + paddw_r2r ( xmm5, xmm2 ); + + paddw_r2r ( xmm0, xmm2 ); + paddw_r2r ( xmm1, xmm2 ); + + psllw_i2r ( 5, xmm2 ); + pand_r2r ( xmm6, xmm2 ); + + por_r2r ( xmm3, xmm2 ); + + movdqu_r2m( xmm2, *output ); + output += 16; + t += 16; + b += 16; + m += 16; + } + output++; t++; b++; m++; + while( width-- ) { + *output = (3 * *t + 3 * *m + 2 * *b) >> 3; + output +=2; t+=2; b+=2; m+=2; + } +} +#endif + + static void vfilter_chroma_332_packed422_scanline_c( uint8_t *output, int width, uint8_t *m, uint8_t *t, uint8_t *b ) { @@ -2458,6 +2666,14 @@ void setup_speedy_calls( uint32_t accel, int verbose ) printf( "speedycode: No MMX or MMXEXT support detected, using C fallbacks.\n" ); } } + + if( speedy_accel & MM_ACCEL_X86_SSE2 ) { + if( verbose ) { + printf( "speedycode: Using SSE2 optimized functions.\n" ); + } + diff_factor_packed422_scanline = diff_factor_packed422_scanline_sse2; + vfilter_chroma_332_packed422_scanline = vfilter_chroma_332_packed422_scanline_sse2; + } #endif } diff --git a/src/post/deinterlace/xine_plugin.c b/src/post/deinterlace/xine_plugin.c index c9d451b4f..3cce42400 100644 --- a/src/post/deinterlace/xine_plugin.c +++ b/src/post/deinterlace/xine_plugin.c @@ -31,10 +31,10 @@ #define LOG */ -#include "xine_internal.h" -#include "post.h" -#include "xineutils.h" -#include "xine_buffer.h" +#include <xine/xine_internal.h> +#include <xine/post.h> +#include <xine/xineutils.h> +#include 
<xine/xine_buffer.h> #include <pthread.h> #include "tvtime.h" @@ -51,7 +51,7 @@ static const post_info_t deinterlace_special_info = { XINE_POST_TYPE_VIDEO_FILTE const plugin_info_t xine_plugin_info[] EXPORTED = { /* type, API, "name", version, special_info, init_function */ - { PLUGIN_POST | PLUGIN_MUST_PRELOAD, 9, "tvtime", XINE_VERSION_CODE, &deinterlace_special_info, &deinterlace_init_plugin }, + { PLUGIN_POST | PLUGIN_MUST_PRELOAD, 10, "tvtime", XINE_VERSION_CODE, &deinterlace_special_info, &deinterlace_init_plugin }, { PLUGIN_NONE, 0, "", 0, NULL, NULL } }; @@ -60,8 +60,8 @@ typedef struct post_plugin_deinterlace_s post_plugin_deinterlace_t; #define MAX_NUM_METHODS 30 static const char *enum_methods[MAX_NUM_METHODS]; -static char *enum_pulldown[] = { "none", "vektor", NULL }; -static char *enum_framerate[] = { "full", "half_top", "half_bottom", NULL }; +static const char *const enum_pulldown[] = { "none", "vektor", NULL }; +static const char *const enum_framerate[] = { "full", "half_top", "half_bottom", NULL }; static void *help_string; @@ -280,8 +280,6 @@ static xine_post_api_t post_api = { static post_plugin_t *deinterlace_open_plugin(post_class_t *class_gen, int inputs, xine_audio_port_t **audio_target, xine_video_port_t **video_target); -static char *deinterlace_get_identifier(post_class_t *class_gen); -static char *deinterlace_get_description(post_class_t *class_gen); static void deinterlace_class_dispose(post_class_t *class_gen); /* plugin instance functions */ @@ -311,8 +309,8 @@ static void *deinterlace_init_plugin(xine_t *xine, void *data) return NULL; class->class.open_plugin = deinterlace_open_plugin; - class->class.get_identifier = deinterlace_get_identifier; - class->class.get_description = deinterlace_get_description; + class->class.identifier = "tvtime"; + class->class.description = N_("advanced deinterlacer plugin with pulldown detection"); class->class.dispose = deinterlace_class_dispose; @@ -425,16 +423,6 @@ static post_plugin_t 
*deinterlace_open_plugin(post_class_t *class_gen, int input return &this->post; } -static char *deinterlace_get_identifier(post_class_t *class_gen) -{ - return "tvtime"; -} - -static char *deinterlace_get_description(post_class_t *class_gen) -{ - return "advanced deinterlacer plugin with pulldown detection"; -} - static void deinterlace_class_dispose(post_class_t *class_gen) { xine_buffer_free(help_string); |