Diffstat (limited to 'src/xine-utils/memcpy.c')
-rw-r--r--  src/xine-utils/memcpy.c | 145
1 file changed, 123 insertions(+), 22 deletions(-)
diff --git a/src/xine-utils/memcpy.c b/src/xine-utils/memcpy.c
index df514b682..0c44e0808 100644
--- a/src/xine-utils/memcpy.c
+++ b/src/xine-utils/memcpy.c
@@ -52,7 +52,8 @@
#define LOG
*/
-#include "xine_internal.h"
+#include <xine/xine_internal.h>
+#include "../xine-engine/xine_private.h"
void *(* xine_fast_memcpy)(void *to, const void *from, size_t len);
@@ -162,6 +163,7 @@ int d0, d1, d2;
return (to);
}
+#define AVX_MMREG_SIZE 32
#define SSE_MMREG_SIZE 16
#define MMX_MMREG_SIZE 8
@@ -257,6 +259,98 @@ static void * sse_memcpy(void * to, const void * from, size_t len)
return retval;
}
+#ifdef HAVE_AVX
+static void * avx_memcpy(void * to, const void * from, size_t len)
+{
+ void *retval;
+ size_t i;
+ retval = to;
+
+ /* PREFETCH has an effect even for the MOVSB instruction ;) */
+ __asm__ __volatile__ (
+ " prefetchnta (%0)\n"
+ " prefetchnta 32(%0)\n"
+ " prefetchnta 64(%0)\n"
+ " prefetchnta 96(%0)\n"
+ " prefetchnta 128(%0)\n"
+ " prefetchnta 160(%0)\n"
+ " prefetchnta 192(%0)\n"
+ " prefetchnta 224(%0)\n"
+ " prefetchnta 256(%0)\n"
+ " prefetchnta 288(%0)\n"
+ : : "r" (from) );
+
+ if(len >= MIN_LEN)
+ {
+ register uintptr_t delta;
+ /* Align destination to an AVX_MMREG_SIZE boundary */
+ delta = ((uintptr_t)to)&(AVX_MMREG_SIZE-1);
+ if(delta)
+ {
+ delta=AVX_MMREG_SIZE-delta;
+ len -= delta;
+ small_memcpy(to, from, delta);
+ }
+ i = len >> 7; /* len/128 */
+ len&=127;
+ if(((uintptr_t)from) & 31)
+ /* if SRC is misaligned */
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+ "prefetchnta 320(%0)\n"
+ "prefetchnta 352(%0)\n"
+ "prefetchnta 384(%0)\n"
+ "prefetchnta 416(%0)\n"
+ "vmovups (%0), %%ymm0\n"
+ "vmovups 32(%0), %%ymm1\n"
+ "vmovups 64(%0), %%ymm2\n"
+ "vmovups 96(%0), %%ymm3\n"
+ "vmovntps %%ymm0, (%1)\n"
+ "vmovntps %%ymm1, 32(%1)\n"
+ "vmovntps %%ymm2, 64(%1)\n"
+ "vmovntps %%ymm3, 96(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from = ((const unsigned char *)from) + 128;
+ to = ((unsigned char *)to) + 128;
+ }
+ else
+ /*
+ Only if SRC is aligned on a 32-byte boundary.
+ This allows the use of vmovaps instead of vmovups; vmovaps requires
+ the data to be aligned, otherwise a general-protection exception
+ (#GP) is generated.
+ */
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+ "prefetchnta 320(%0)\n"
+ "prefetchnta 352(%0)\n"
+ "prefetchnta 384(%0)\n"
+ "prefetchnta 416(%0)\n"
+ "vmovaps (%0), %%ymm0\n"
+ "vmovaps 32(%0), %%ymm1\n"
+ "vmovaps 64(%0), %%ymm2\n"
+ "vmovaps 96(%0), %%ymm3\n"
+ "vmovntps %%ymm0, (%1)\n"
+ "vmovntps %%ymm1, 32(%1)\n"
+ "vmovntps %%ymm2, 64(%1)\n"
+ "vmovntps %%ymm3, 96(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from = ((const unsigned char *)from) + 128;
+ to = ((unsigned char *)to) + 128;
+ }
+ /* since vmovntps is weakly-ordered, an "sfence"
+ * is needed to make the stores globally visible in order. */
+ __asm__ __volatile__ ("sfence":::"memory");
+ }
+ /*
+ * Now do the tail of the block
+ */
+ if(len) linux_kernel_memcpy_impl(to, from, len);
+ return retval;
+}
+#endif /* HAVE_AVX */
+
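
The method table below gates the new entry on MM_ACCEL_X86_AVX, which implies a runtime capability check elsewhere in the engine. Worth noting: usable AVX requires more than the CPUID AVX bit, since the OS must also enable YMM state saving (OSXSAVE set, XCR0 bits 1 and 2). A minimal sketch of such a check, assuming GCC's <cpuid.h>; this is illustrative, not xine's actual detection code:

/* Sketch: runtime test for usable AVX (CPUID + OS support).
 * Not part of this patch; xine's real check lives elsewhere. */
#include <cpuid.h>
#include <stdint.h>

static int have_usable_avx(void)
{
  unsigned int eax, ebx, ecx, edx;
  uint32_t xcr0_lo, xcr0_hi;

  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return 0;
  if (!(ecx & bit_AVX) || !(ecx & bit_OSXSAVE))
    return 0;
  /* xgetbv reads XCR0; bits 1 and 2 mean XMM and YMM state are saved */
  __asm__ ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
  return (xcr0_lo & 6) == 6;
}
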
static void * mmx_memcpy(void * to, const void * from, size_t len)
{
void *retval;
@@ -384,30 +478,33 @@ static void *linux_kernel_memcpy(void *to, const void *from, size_t len) {
#endif /* _MSC_VER */
#endif /* ARCH_X86 */
-static struct {
- char *name;
- void *(* function)(void *to, const void *from, size_t len);
-
- uint64_t time; /* This type could be used for non-MSC build too! */
+static const struct {
+ const char name[16];
+ void *(*const function)(void *to, const void *from, size_t len);
uint32_t cpu_require;
} memcpy_method[] =
{
- { NULL, NULL, 0, 0 },
- { "libc memcpy()", memcpy, 0, 0 },
+ { "", NULL, 0 },
+ { "libc", memcpy, 0 },
#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined(_MSC_VER)
- { "linux kernel memcpy()", linux_kernel_memcpy, 0, 0 },
- { "MMX optimized memcpy()", mmx_memcpy, 0, MM_MMX },
- { "MMXEXT optimized memcpy()", mmx2_memcpy, 0, MM_MMXEXT },
- { "SSE optimized memcpy()", sse_memcpy, 0, MM_MMXEXT|MM_SSE },
+ { "linux kernel", linux_kernel_memcpy, 0 },
+ { "MMX ", mmx_memcpy, MM_MMX },
+ { "MMXEXT", mmx2_memcpy, MM_MMXEXT },
+ { "SSE", sse_memcpy, MM_MMXEXT|MM_SSE },
+# ifdef HAVE_AVX
+ { "AVX", avx_memcpy, MM_ACCEL_X86_AVX },
+# endif /* HAVE_AVX */
#endif /* ARCH_X86 */
#if defined (ARCH_PPC) && !defined (HOST_OS_DARWIN)
- { "ppcasm_memcpy()", ppcasm_memcpy, 0, 0 },
- { "ppcasm_cacheable_memcpy()", ppcasm_cacheable_memcpy, 0, MM_ACCEL_PPC_CACHE32 },
+ { "ppcasm", ppcasm_memcpy, 0 },
+ { "ppcasm_cached", ppcasm_cacheable_memcpy, MM_ACCEL_PPC_CACHE32 },
#endif /* ARCH_PPC && !HOST_OS_DARWIN */
- { NULL, NULL, 0, 0 }
+ { "", NULL, 0 }
};
+static uint64_t memcpy_timing[sizeof(memcpy_method)/sizeof(memcpy_method[0])] = { 0, };
+
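A note on the restructuring above: moving the mutable time field out into memcpy_timing[] lets the whole method table be const, so it can be placed in read-only memory, and sizing the timing array from the table keeps the two in sync automatically. A minimal sketch of the same pattern, with hypothetical names:

/* Const lookup table plus a parallel mutable array, as in the patch.
 * All names here are hypothetical. */
#include <stdint.h>

static const struct {
  const char name[16];       /* fixed-size array keeps the entry const */
  int id;
} method_table[] = {
  { "",     0 },             /* placeholder slot, as above */
  { "fast", 1 },
  { "",     0 }              /* empty-name sentinel ends iteration */
};

/* Mutable state kept separate so method_table can live in .rodata. */
static uint64_t method_timing[sizeof(method_table)/sizeof(method_table[0])];
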
#ifdef HAVE_POSIX_TIMERS
/* Prefer clock_gettime() where available. */
static int64_t _x_gettime(void)
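
The body of _x_gettime() lies outside the diff context shown here. For orientation, a clock_gettime()-based helper of this kind typically looks like the following sketch; the file's actual implementation may differ in clock choice and units:

/* Sketch only: monotonic timestamp in nanoseconds. */
#include <stdint.h>
#include <time.h>

static int64_t _x_gettime(void)
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (int64_t)ts.tv_sec * INT64_C(1000000000) + ts.tv_nsec;
}
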
@@ -459,7 +556,7 @@ static void update_fast_memcpy(void *user_data, xine_cfg_entry_t *entry) {
if (method != 0
&& (config_flags & memcpy_method[method].cpu_require) ==
memcpy_method[method].cpu_require ) {
- lprintf("using %s\n", memcpy_method[method].name );
+ lprintf("using %s memcpy()\n", memcpy_method[method].name );
xine_fast_memcpy = memcpy_method[method].function;
return;
} else {
@@ -474,10 +571,13 @@ void xine_probe_fast_memcpy(xine_t *xine)
char *buf1, *buf2;
int i, j, best;
int config_flags = -1;
- static const char *memcpy_methods[] = {
+ static const char *const memcpy_methods[] = {
"probe", "libc",
#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined(_MSC_VER)
"kernel", "mmx", "mmxext", "sse",
+# ifdef HAVE_AVX
+ "avx",
+# endif /* HAVE_AVX */
#endif
#if defined (ARCH_PPC) && !defined (HOST_OS_DARWIN)
"ppcasm_memcpy", "ppcasm_cacheable_memcpy",
@@ -498,9 +598,10 @@ void xine_probe_fast_memcpy(xine_t *xine)
/* check if function is configured and valid for this machine */
if( best != 0 &&
+ best < sizeof(memcpy_methods)/sizeof(memcpy_method[0]) &&
(config_flags & memcpy_method[best].cpu_require) ==
memcpy_method[best].cpu_require ) {
- lprintf("using %s\n", memcpy_method[best].name );
+ lprintf("using %s memcpy()\n", memcpy_method[best].name );
xine_fast_memcpy = memcpy_method[best].function;
return;
}
@@ -528,7 +629,7 @@ void xine_probe_fast_memcpy(xine_t *xine)
memcpy_method[1].function(buf1,buf2,BUFSIZE);
}
- for(i=1; memcpy_method[i].name; i++)
+ for(i=1; memcpy_method[i].name[0]; i++)
{
if( (config_flags & memcpy_method[i].cpu_require) !=
memcpy_method[i].cpu_require )
@@ -541,11 +642,11 @@ void xine_probe_fast_memcpy(xine_t *xine)
}
t = rdtsc(config_flags) - t;
- memcpy_method[i].time = t;
+ memcpy_timing[i] = t;
- xprintf(xine, XINE_VERBOSITY_LOG, "\t%s : %" PRIu64 "\n", memcpy_method[i].name, t);
+ xprintf(xine, XINE_VERBOSITY_LOG, "\t%s memcpy() : %" PRIu64 "\n", memcpy_method[i].name, t);
- if( best == 0 || t < memcpy_method[best].time )
+ if( best == 0 || t < memcpy_timing[best] )
best = i;
}
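
Taken together, the probe loop times each eligible method over the same buffers and keeps the fastest; the winner is then installed as xine_fast_memcpy, matching the assignment seen in update_fast_memcpy() above. A compressed sketch of the loop's logic using the file's own names; the repeat count is illustrative, and buf1/buf2/BUFSIZE are as allocated earlier in the function:

/* Sketch of the probe's core: skip methods the CPU cannot run,
 * time the rest, remember the fastest. */
for (i = 1; memcpy_method[i].name[0]; i++) {
  if ((config_flags & memcpy_method[i].cpu_require)
      != memcpy_method[i].cpu_require)
    continue;                        /* required CPU features missing */

  t = rdtsc(config_flags);
  for (j = 0; j < 50; j++)           /* illustrative repeat count */
    memcpy_method[i].function(buf1, buf2, BUFSIZE);
  t = rdtsc(config_flags) - t;

  memcpy_timing[i] = t;
  if (best == 0 || t < memcpy_timing[best])
    best = i;
}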