Diffstat (limited to 'src/xine-utils/memcpy.c')
-rw-r--r-- | src/xine-utils/memcpy.c | 145
1 file changed, 123 insertions(+), 22 deletions(-)
diff --git a/src/xine-utils/memcpy.c b/src/xine-utils/memcpy.c
index df514b682..0c44e0808 100644
--- a/src/xine-utils/memcpy.c
+++ b/src/xine-utils/memcpy.c
@@ -52,7 +52,8 @@
 #define LOG
 */
 
-#include "xine_internal.h"
+#include <xine/xine_internal.h>
+#include "../xine-engine/xine_private.h"
 
 void *(* xine_fast_memcpy)(void *to, const void *from, size_t len);
 
@@ -162,6 +163,7 @@ int d0, d1, d2;
   return (to);
 }
 
+#define AVX_MMREG_SIZE 32
 #define SSE_MMREG_SIZE 16
 #define MMX_MMREG_SIZE 8
 
@@ -257,6 +259,98 @@ static void * sse_memcpy(void * to, const void * from, size_t len)
   return retval;
 }
 
+#ifdef HAVE_AVX
+static void * avx_memcpy(void * to, const void * from, size_t len)
+{
+  void *retval;
+  size_t i;
+  retval = to;
+
+  /* PREFETCH has effect even for MOVSB instruction ;) */
+  __asm__ __volatile__ (
+    "   prefetchnta (%0)\n"
+    "   prefetchnta 32(%0)\n"
+    "   prefetchnta 64(%0)\n"
+    "   prefetchnta 96(%0)\n"
+    "   prefetchnta 128(%0)\n"
+    "   prefetchnta 160(%0)\n"
+    "   prefetchnta 192(%0)\n"
+    "   prefetchnta 224(%0)\n"
+    "   prefetchnta 256(%0)\n"
+    "   prefetchnta 288(%0)\n"
+    : : "r" (from) );
+
+  if(len >= MIN_LEN)
+  {
+    register uintptr_t delta;
+    /* Align destination to AVX_MMREG_SIZE boundary */
+    delta = ((uintptr_t)to)&(AVX_MMREG_SIZE-1);
+    if(delta)
+    {
+      delta = AVX_MMREG_SIZE-delta;
+      len -= delta;
+      small_memcpy(to, from, delta);
+    }
+    i = len >> 7; /* len/128 */
+    len &= 127;
+    if(((uintptr_t)from) & 31)
+      /* if SRC is misaligned */
+      for(; i>0; i--)
+      {
+        __asm__ __volatile__ (
+        "prefetchnta 320(%0)\n"
+        "prefetchnta 352(%0)\n"
+        "prefetchnta 384(%0)\n"
+        "prefetchnta 416(%0)\n"
+        "vmovups    (%0), %%ymm0\n"
+        "vmovups  32(%0), %%ymm1\n"
+        "vmovups  64(%0), %%ymm2\n"
+        "vmovups  96(%0), %%ymm3\n"
+        "vmovntps %%ymm0,   (%1)\n"
+        "vmovntps %%ymm1, 32(%1)\n"
+        "vmovntps %%ymm2, 64(%1)\n"
+        "vmovntps %%ymm3, 96(%1)\n"
+        :: "r" (from), "r" (to) : "memory");
+        from = ((const unsigned char *)from) + 128;
+        to = ((unsigned char *)to) + 128;
+      }
+    else
+      /*
+        Only if SRC is aligned on a 32-byte boundary.  This lets us use
+        vmovaps instead of vmovups; vmovaps requires the data to be
+        aligned, otherwise a general-protection exception (#GP) is generated.
+      */
+      for(; i>0; i--)
+      {
+        __asm__ __volatile__ (
+        "prefetchnta 320(%0)\n"
+        "prefetchnta 352(%0)\n"
+        "prefetchnta 384(%0)\n"
+        "prefetchnta 416(%0)\n"
+        "vmovaps    (%0), %%ymm0\n"
+        "vmovaps  32(%0), %%ymm1\n"
+        "vmovaps  64(%0), %%ymm2\n"
+        "vmovaps  96(%0), %%ymm3\n"
+        "vmovntps %%ymm0,   (%1)\n"
+        "vmovntps %%ymm1, 32(%1)\n"
+        "vmovntps %%ymm2, 64(%1)\n"
+        "vmovntps %%ymm3, 96(%1)\n"
+        :: "r" (from), "r" (to) : "memory");
+        from = ((const unsigned char *)from) + 128;
+        to = ((unsigned char *)to) + 128;
+      }
+    /* since vmovntps is weakly-ordered, an "sfence"
+     * is needed to become ordered again. */
+    __asm__ __volatile__ ("sfence":::"memory");
+  }
+  /*
+   * Now do the tail of the block
+   */
+  if(len) linux_kernel_memcpy_impl(to, from, len);
+  return retval;
+}
+#endif /* HAVE_AVX */
+
 static void * mmx_memcpy(void * to, const void * from, size_t len)
 {
   void *retval;
@@ -384,30 +478,33 @@ static void *linux_kernel_memcpy(void *to, const void *from, size_t len) {
 #endif /* _MSC_VER */
 #endif /* ARCH_X86 */
 
-static struct {
-  char *name;
-  void *(* function)(void *to, const void *from, size_t len);
-
-  uint64_t time; /* This type could be used for non-MSC build too! */
+static const struct {
+  const char name[16];
+  void *(*const function)(void *to, const void *from, size_t len);
 
   uint32_t cpu_require;
 } memcpy_method[] =
 {
-  { NULL, NULL, 0, 0 },
-  { "libc memcpy()", memcpy, 0, 0 },
+  { "", NULL, 0 },
+  { "libc", memcpy, 0 },
 #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined(_MSC_VER)
-  { "linux kernel memcpy()", linux_kernel_memcpy, 0, 0 },
-  { "MMX optimized memcpy()", mmx_memcpy, 0, MM_MMX },
-  { "MMXEXT optimized memcpy()", mmx2_memcpy, 0, MM_MMXEXT },
-  { "SSE optimized memcpy()", sse_memcpy, 0, MM_MMXEXT|MM_SSE },
+  { "linux kernel", linux_kernel_memcpy, 0 },
+  { "MMX", mmx_memcpy, MM_MMX },
+  { "MMXEXT", mmx2_memcpy, MM_MMXEXT },
+  { "SSE", sse_memcpy, MM_MMXEXT|MM_SSE },
+# ifdef HAVE_AVX
+  { "AVX", avx_memcpy, MM_ACCEL_X86_AVX },
+# endif /* HAVE_AVX */
 #endif /* ARCH_X86 */
 #if defined (ARCH_PPC) && !defined (HOST_OS_DARWIN)
-  { "ppcasm_memcpy()", ppcasm_memcpy, 0, 0 },
-  { "ppcasm_cacheable_memcpy()", ppcasm_cacheable_memcpy, 0, MM_ACCEL_PPC_CACHE32 },
+  { "ppcasm", ppcasm_memcpy, 0 },
+  { "ppcasm_cached", ppcasm_cacheable_memcpy, MM_ACCEL_PPC_CACHE32 },
 #endif /* ARCH_PPC && !HOST_OS_DARWIN */
-  { NULL, NULL, 0, 0 }
+  { "", NULL, 0 }
 };
 
+static uint64_t memcpy_timing[sizeof(memcpy_method)/sizeof(memcpy_method[0])] = { 0, };
+
 #ifdef HAVE_POSIX_TIMERS
 /* Prefer clock_gettime() where available. */
 static int64_t _x_gettime(void)
@@ -459,7 +556,7 @@ static void update_fast_memcpy(void *user_data, xine_cfg_entry_t *entry) {
   if (method != 0
       && (config_flags & memcpy_method[method].cpu_require) ==
          memcpy_method[method].cpu_require ) {
-    lprintf("using %s\n", memcpy_method[method].name );
+    lprintf("using %s memcpy()\n", memcpy_method[method].name );
     xine_fast_memcpy = memcpy_method[method].function;
     return;
   } else {
@@ -474,10 +571,13 @@ void xine_probe_fast_memcpy(xine_t *xine)
   char *buf1, *buf2;
   int i, j, best;
   int config_flags = -1;
-  static const char *memcpy_methods[] = {
+  static const char *const memcpy_methods[] = {
    "probe", "libc",
 #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined(_MSC_VER)
    "kernel", "mmx", "mmxext", "sse",
+# ifdef HAVE_AVX
+   "avx",
+# endif /* HAVE_AVX */
 #endif
 #if defined (ARCH_PPC) && !defined (HOST_OS_DARWIN)
    "ppcasm_memcpy", "ppcasm_cacheable_memcpy",
@@ -498,9 +598,10 @@ void xine_probe_fast_memcpy(xine_t *xine)
   /* check if function is configured and valid for this machine */
   if( best != 0 &&
+      best < sizeof(memcpy_method)/sizeof(memcpy_method[0]) &&
      (config_flags & memcpy_method[best].cpu_require) ==
       memcpy_method[best].cpu_require ) {
-    lprintf("using %s\n", memcpy_method[best].name );
+    lprintf("using %s memcpy()\n", memcpy_method[best].name );
     xine_fast_memcpy = memcpy_method[best].function;
     return;
   }
 
@@ -528,7 +629,7 @@ void xine_probe_fast_memcpy(xine_t *xine)
     memcpy_method[1].function(buf1,buf2,BUFSIZE);
   }
 
-  for(i=1; memcpy_method[i].name; i++)
+  for(i=1; memcpy_method[i].name[0]; i++)
   {
     if( (config_flags & memcpy_method[i].cpu_require) !=
          memcpy_method[i].cpu_require )
@@ -541,11 +642,11 @@
     }
     t = rdtsc(config_flags) - t;
 
-    memcpy_method[i].time = t;
+    memcpy_timing[i] = t;
 
-    xprintf(xine, XINE_VERBOSITY_LOG, "\t%s : %" PRIu64 "\n", memcpy_method[i].name, t);
+    xprintf(xine, XINE_VERBOSITY_LOG, "\t%s memcpy() : %" PRIu64 "\n", memcpy_method[i].name, t);
 
-    if( best == 0 || t < memcpy_method[best].time )
+    if( best == 0 || t < memcpy_timing[best] )
       best = i;
   }
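The probe loop at the end of the diff is the heart of this file: every candidate the CPU supports is timed over a scratch buffer, and the winner is installed in the xine_fast_memcpy function pointer. Below is a minimal, self-contained sketch of that probe-and-dispatch pattern, assuming hypothetical names (demo_bytewise_memcpy, probe_fast_memcpy, PROBE_BUFSIZE) and using clock_gettime() in place of the rdtsc()/_x_gettime() timing above; it is an illustration, not xine-lib code.

/* Hypothetical sketch of probe-and-dispatch; not xine-lib code. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>

#define PROBE_BUFSIZE (1024 * 1024)  /* scratch area copied per round */
#define PROBE_ROUNDS  64             /* rounds per candidate */

typedef void *(*memcpy_fn)(void *to, const void *from, size_t len);

/* Stand-in for an "optimized" variant; the real table holds mmx_memcpy,
 * sse_memcpy, avx_memcpy, ... each guarded by a cpu_require flag. */
static void *demo_bytewise_memcpy(void *to, const void *from, size_t len)
{
  char *d = to;
  const char *s = from;
  while (len--)
    *d++ = *s++;
  return to;
}

static const struct {
  const char *name;
  memcpy_fn   function;
} candidate[] = {
  { "libc",     memcpy },
  { "bytewise", demo_bytewise_memcpy },
  { NULL,       NULL }
};

/* Every caller goes through this pointer, like xine_fast_memcpy. */
static memcpy_fn fast_memcpy = memcpy;

static int64_t now_ns(void)
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

static void probe_fast_memcpy(void)
{
  char *buf1 = malloc(PROBE_BUFSIZE);
  char *buf2 = malloc(PROBE_BUFSIZE);
  int64_t best_t = 0;
  int i, j, best = -1;

  if (!buf1 || !buf2) {
    free(buf1);
    free(buf2);
    return;                          /* keep the libc default */
  }
  memset(buf1, 'x', PROBE_BUFSIZE);

  for (i = 0; candidate[i].name; i++) {
    int64_t t;
    candidate[i].function(buf2, buf1, PROBE_BUFSIZE);  /* warm caches */
    t = now_ns();
    for (j = 0; j < PROBE_ROUNDS; j++)
      candidate[i].function(buf2, buf1, PROBE_BUFSIZE);
    t = now_ns() - t;
    printf("\t%-8s : %lld ns\n", candidate[i].name, (long long)t);
    if (best < 0 || t < best_t) {
      best = i;
      best_t = t;
    }
  }
  fast_memcpy = candidate[best].function;
  free(buf1);
  free(buf2);
}

int main(void)
{
  char src[] = "probe done", dst[sizeof(src)];

  probe_fast_memcpy();
  fast_memcpy(dst, src, sizeof(src));  /* later copies use the winner */
  puts(dst);
  return 0;
}

Routing all copies through one global function pointer makes the benchmark a one-time startup cost, and timing the candidates rather than trusting CPU feature flags alone mirrors why the real code keeps both: cpu_require filters out variants the machine cannot run, while the measurement decides among those that remain.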