author     Daniel Caujolle-Bert <f1rmb@users.sourceforge.net>   2001-10-25 09:23:15 +0000
committer  Daniel Caujolle-Bert <f1rmb@users.sourceforge.net>   2001-10-25 09:23:15 +0000
commit     bf1afaa6fb6b63ffb324e1e1f972cc4f03634825 (patch)
tree       74883fe072544ed97c10d6a30fb3f6500aa003ec /src
parent     a5049f8fa3fe56454a2763d383af2a568c82d964 (diff)
download   xine-lib-bf1afaa6fb6b63ffb324e1e1f972cc4f03634825.tar.gz
           xine-lib-bf1afaa6fb6b63ffb324e1e1f972cc4f03634825.tar.bz2
This file was already moved to the xine-utils subdir.
CVS patchset: 883
CVS date:     2001/10/25 09:23:15
Diffstat (limited to 'src')
-rw-r--r--   src/xine-engine/memcpy.c   451
1 file changed, 0 insertions, 451 deletions
diff --git a/src/xine-engine/memcpy.c b/src/xine-engine/memcpy.c
deleted file mode 100644
index 511dd38ef..000000000
--- a/src/xine-engine/memcpy.c
+++ /dev/null
@@ -1,451 +0,0 @@
-/*
- * Copyright (C) 2001 the xine project
- *
- * This file is part of xine, a unix video player.
- *
- * xine is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * xine is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
- *
- * These are the MMX/MMX2/SSE optimized versions of memcpy
- *
- * This code was adapted from Linux kernel sources by Nick Kurshev for
- * the mplayer program. (http://mplayer.sourceforge.net)
- *
- * Miguel Freitas split the #ifdefs into several specialized functions that
- * are benchmarked at runtime by xine. Some original comments from Nick
- * have been preserved, documenting some MMX/SSE oddities. He also added a
- * kernel memcpy function that seems faster than the glibc one.
- *
- */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdlib.h>
-#include <string.h>
-#include "xine_internal.h"
-#include "cpu_accel.h"
-
-void *(* fast_memcpy)(void *to, const void *from, size_t len);
-
-/* Original comments from mplayer (file: aclib.c)
-   This part of the code was taken by me from Linux-2.4.3 and slightly
-modified for the MMX, MMX2 and SSE instruction sets. I did this because
-Linux uses page-aligned blocks, while mplayer uses weakly ordered data,
-which the original sources cannot speed up. Only using PREFETCHNTA and
-MOVNTQ together has any effect!
-
-From the IA-32 Intel Architecture Software Developer's Manual, Volume 1,
-Order Number 245470:
-"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
-
-Data referenced by a program can be temporal (data will be used again) or
-non-temporal (data will be referenced once and not reused in the immediate
-future). To make efficient use of the processor's caches, it is generally
-desirable to cache temporal data and not cache non-temporal data. Overloading
-the processor's caches with non-temporal data is sometimes referred to as
-"polluting the caches".
-The non-temporal data is written to memory with Write-Combining semantics.
-
-The PREFETCHh instructions permit a program to load data into the processor
-at a suggested cache level, so that it is closer to the processor's load and
-store units when it is needed. If the data is already present in a level of
-the cache hierarchy that is closer to the processor, the PREFETCHh instruction
-will not result in any data movement.
-But we should use PREFETCHNTA: it fetches non-temporal data into a location
-close to the processor, minimizing cache pollution.
-
-The MOVNTQ (store quadword using non-temporal hint) instruction stores
-packed integer data from an MMX register to memory, using a non-temporal hint.
-The MOVNTPS (store packed single-precision floating-point values using
-non-temporal hint) instruction stores packed floating-point data from an
-XMM register to memory, using a non-temporal hint.
-
-The SFENCE (Store Fence) instruction controls write ordering by creating a
-fence for memory store operations. This instruction guarantees that the results
-of every store instruction that precedes the store fence in program order are
-globally visible before any store instruction that follows the fence. The
-SFENCE instruction provides an efficient way of ensuring ordering between
-procedures that produce weakly-ordered data and procedures that consume that
-data.
-
-If you have questions, please contact me: Nick Kurshev <nickols_k@mail.ru>.
-*/
-
-/* mmx v.1 note: since we added alignment of the destination, it speeds up
-   memory copying on the PentMMX, Celeron-1 and P2 by up to 12% versus the
-   standard (non MMX-optimized) version.
-   Note: on the K6-2+ it speeds up memory copying by up to 25%, and on the
-   K7 and P3 by about 500% (5 times).
-*/
-
-/* Additional notes on gcc assembly and processors: [MF]
-"prefetch" is specific to AMD processors; the Intel equivalents should be
-prefetcht0, prefetcht1 and prefetcht2, which are not recognized by my gcc.
-prefetchnta is supported on both the Athlon and the Pentium 3.
-
-Therefore I will take the prefetchnta instructions out of the mmx1 version
-to avoid problems on the Pentium MMX and K6-2.
-
-Quote of the day:
-"Using prefetches efficiently is more of an art than a science"
-*/
-
-
-#ifdef ARCH_X86
-
-/* for small memory blocks (<256 bytes) this version is faster */
-#define small_memcpy(to,from,n)\
-{\
-register unsigned long int dummy;\
-__asm__ __volatile__(\
- "rep; movsb"\
- :"=&D"(to), "=&S"(from), "=&c"(dummy)\
- :"0" (to), "1" (from),"2" (n)\
- : "memory");\
-}
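
/* Aside: "rep; movsb" is the x86 string copy, moving ECX bytes from
 * [ESI] to [EDI]. A minimal C sketch of the macro's semantics, for
 * illustration only (note that the asm version also writes the advanced
 * pointers back into "to" and "from", which the callers below rely on):
 */
#include <stddef.h>

static void small_memcpy_c(void *to, const void *from, size_t n)
{
  unsigned char *d = (unsigned char *)to;
  const unsigned char *s = (const unsigned char *)from;
  while (n--)              /* one byte per iteration, like rep; movsb */
    *d++ = *s++;
}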
-
-/* linux kernel __memcpy (from: /include/asm/string.h) */
-static inline void * __memcpy(void * to, const void * from, size_t n)
-{
-int d0, d1, d2;
-void *retval = to;  /* small_memcpy() advances "to", so save the return value first */
-
- if( n < 4 ) {
- small_memcpy(to,from,n);
- }
- else
- __asm__ __volatile__(
- "rep ; movsl\n\t"
- "testb $2,%b4\n\t"
- "je 1f\n\t"
- "movsw\n"
- "1:\ttestb $1,%b4\n\t"
- "je 2f\n\t"
- "movsb\n"
- "2:"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
- : "memory");
-
-  return (retval);
-}
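
/* Aside: an equivalent portable C sketch of the kernel __memcpy above,
 * assuming <stdint.h>; illustrative only. "rep; movsl" copies n/4 32-bit
 * words, then bit 1 and bit 0 of n select the trailing movsw/movsb.
 */
#include <stddef.h>
#include <stdint.h>

static void *__memcpy_c(void *to, const void *from, size_t n)
{
  uint32_t *d = (uint32_t *)to;
  const uint32_t *s = (const uint32_t *)from;
  size_t words = n / 4;

  while (words--)                       /* rep; movsl */
    *d++ = *s++;

  {
    unsigned char *d1 = (unsigned char *)d;
    const unsigned char *s1 = (const unsigned char *)s;
    if (n & 2) {                        /* movsw: 2-byte tail */
      *d1++ = *s1++;
      *d1++ = *s1++;
    }
    if (n & 1)                          /* movsb: 1-byte tail */
      *d1 = *s1;
  }
  return to;
}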
-
-#define SSE_MMREG_SIZE 16
-#define MMX_MMREG_SIZE 8
-
-#define MMX1_MIN_LEN 0x800 /* 2K blocks */
-#define MIN_LEN 0x40 /* 64-byte blocks */
-
-/* SSE note: I tried to move 128 bytes at a time instead of 64, but it
-didn't make any measurable difference. I'm using 64 for the sake of
-simplicity. [MF] */
-static void * sse_memcpy(void * to, const void * from, size_t len)
-{
- void *retval;
- size_t i;
- retval = to;
-
-  /* PREFETCH has an effect even for the MOVSB instruction ;) */
- __asm__ __volatile__ (
- " prefetchnta (%0)\n"
- " prefetchnta 64(%0)\n"
- " prefetchnta 128(%0)\n"
- " prefetchnta 192(%0)\n"
- " prefetchnta 256(%0)\n"
- : : "r" (from) );
-
- if(len >= MIN_LEN)
- {
- register unsigned long int delta;
-    /* Align the destination to an SSE_MMREG_SIZE boundary */
- delta = ((unsigned long int)to)&(SSE_MMREG_SIZE-1);
- if(delta)
- {
- delta=SSE_MMREG_SIZE-delta;
- len -= delta;
- small_memcpy(to, from, delta);
- }
- i = len >> 6; /* len/64 */
- len&=63;
- if(((unsigned long)from) & 15)
- /* if SRC is misaligned */
- for(; i>0; i--)
- {
- __asm__ __volatile__ (
- "prefetchnta 320(%0)\n"
- "movups (%0), %%xmm0\n"
- "movups 16(%0), %%xmm1\n"
- "movups 32(%0), %%xmm2\n"
- "movups 48(%0), %%xmm3\n"
- "movntps %%xmm0, (%1)\n"
- "movntps %%xmm1, 16(%1)\n"
- "movntps %%xmm2, 32(%1)\n"
- "movntps %%xmm3, 48(%1)\n"
- :: "r" (from), "r" (to) : "memory");
-      from = (const unsigned char *)from + 64;
-      to = (unsigned char *)to + 64;
- }
- else
-    /*
-       Only if SRC is aligned on a 16-byte boundary.
-       This allows the use of movaps instead of movups; movaps requires the
-       data to be aligned, otherwise a general-protection exception (#GP)
-       is generated.
-    */
- for(; i>0; i--)
- {
- __asm__ __volatile__ (
- "prefetchnta 320(%0)\n"
- "movaps (%0), %%xmm0\n"
- "movaps 16(%0), %%xmm1\n"
- "movaps 32(%0), %%xmm2\n"
- "movaps 48(%0), %%xmm3\n"
- "movntps %%xmm0, (%1)\n"
- "movntps %%xmm1, 16(%1)\n"
- "movntps %%xmm2, 32(%1)\n"
- "movntps %%xmm3, 48(%1)\n"
- :: "r" (from), "r" (to) : "memory");
-      from = (const unsigned char *)from + 64;
-      to = (unsigned char *)to + 64;
- }
-    /* since movntps is weakly-ordered, an "sfence" is needed to become
-     * ordered again. */
- __asm__ __volatile__ ("sfence":::"memory");
-    /* emms clears the MMX state so the FPU can be used again */
- __asm__ __volatile__ ("emms":::"memory");
- }
- /*
- * Now do the tail of the block
- */
- if(len) __memcpy(to, from, len);
- return retval;
-}
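
/* Aside: the aligned inner loop of sse_memcpy, rewritten as an
 * illustrative sketch with the SSE intrinsics from <xmmintrin.h> instead
 * of raw asm. Assumes a compiler shipping Intel intrinsics; "to" and
 * "from" must be 16-byte aligned and "blocks" counts 64-byte chunks.
 */
#include <stddef.h>
#include <xmmintrin.h>

static void sse_stream_blocks(float *to, const float *from, size_t blocks)
{
  __m128 a, b, c, d;
  while (blocks--) {
    _mm_prefetch((const char *)from + 320, _MM_HINT_NTA); /* prefetchnta */
    a = _mm_load_ps(from +  0);         /* movaps: needs aligned source */
    b = _mm_load_ps(from +  4);
    c = _mm_load_ps(from +  8);
    d = _mm_load_ps(from + 12);
    _mm_stream_ps(to +  0, a);          /* movntps: non-temporal store */
    _mm_stream_ps(to +  4, b);
    _mm_stream_ps(to +  8, c);
    _mm_stream_ps(to + 12, d);
    from += 16;                         /* 64 bytes = 16 floats */
    to   += 16;
  }
  _mm_sfence();                         /* order the weakly-ordered stores */
}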
-
-static void * mmx_memcpy(void * to, const void * from, size_t len)
-{
- void *retval;
- size_t i;
- retval = to;
-
- if(len >= MMX1_MIN_LEN)
- {
- register unsigned long int delta;
-    /* Align the destination to an MMX_MMREG_SIZE boundary */
- delta = ((unsigned long int)to)&(MMX_MMREG_SIZE-1);
- if(delta)
- {
- delta=MMX_MMREG_SIZE-delta;
- len -= delta;
- small_memcpy(to, from, delta);
- }
- i = len >> 6; /* len/64 */
- len&=63;
- for(; i>0; i--)
- {
- __asm__ __volatile__ (
- "movq (%0), %%mm0\n"
- "movq 8(%0), %%mm1\n"
- "movq 16(%0), %%mm2\n"
- "movq 24(%0), %%mm3\n"
- "movq 32(%0), %%mm4\n"
- "movq 40(%0), %%mm5\n"
- "movq 48(%0), %%mm6\n"
- "movq 56(%0), %%mm7\n"
- "movq %%mm0, (%1)\n"
- "movq %%mm1, 8(%1)\n"
- "movq %%mm2, 16(%1)\n"
- "movq %%mm3, 24(%1)\n"
- "movq %%mm4, 32(%1)\n"
- "movq %%mm5, 40(%1)\n"
- "movq %%mm6, 48(%1)\n"
- "movq %%mm7, 56(%1)\n"
- :: "r" (from), "r" (to) : "memory");
-      from = (const unsigned char *)from + 64;
-      to = (unsigned char *)to + 64;
- }
- __asm__ __volatile__ ("emms":::"memory");
- }
- /*
- * Now do the tail of the block
- */
- if(len) __memcpy(to, from, len);
- return retval;
-}
-
-static void * mmx2_memcpy(void * to, const void * from, size_t len)
-{
- void *retval;
- size_t i;
- retval = to;
-
-  /* PREFETCH has an effect even for the MOVSB instruction ;) */
- __asm__ __volatile__ (
- " prefetchnta (%0)\n"
- " prefetchnta 64(%0)\n"
- " prefetchnta 128(%0)\n"
- " prefetchnta 192(%0)\n"
- " prefetchnta 256(%0)\n"
- : : "r" (from) );
-
- if(len >= MIN_LEN)
- {
- register unsigned long int delta;
-    /* Align the destination to an MMX_MMREG_SIZE boundary */
- delta = ((unsigned long int)to)&(MMX_MMREG_SIZE-1);
- if(delta)
- {
- delta=MMX_MMREG_SIZE-delta;
- len -= delta;
- small_memcpy(to, from, delta);
- }
- i = len >> 6; /* len/64 */
- len&=63;
- for(; i>0; i--)
- {
- __asm__ __volatile__ (
- "prefetchnta 320(%0)\n"
- "movq (%0), %%mm0\n"
- "movq 8(%0), %%mm1\n"
- "movq 16(%0), %%mm2\n"
- "movq 24(%0), %%mm3\n"
- "movq 32(%0), %%mm4\n"
- "movq 40(%0), %%mm5\n"
- "movq 48(%0), %%mm6\n"
- "movq 56(%0), %%mm7\n"
- "movntq %%mm0, (%1)\n"
- "movntq %%mm1, 8(%1)\n"
- "movntq %%mm2, 16(%1)\n"
- "movntq %%mm3, 24(%1)\n"
- "movntq %%mm4, 32(%1)\n"
- "movntq %%mm5, 40(%1)\n"
- "movntq %%mm6, 48(%1)\n"
- "movntq %%mm7, 56(%1)\n"
- :: "r" (from), "r" (to) : "memory");
-      from = (const unsigned char *)from + 64;
-      to = (unsigned char *)to + 64;
- }
-    /* since movntq is weakly-ordered, an "sfence" is needed to become
-     * ordered again. */
- __asm__ __volatile__ ("sfence":::"memory");
- __asm__ __volatile__ ("emms":::"memory");
- }
- /*
- * Now do the tail of the block
- */
- if(len) __memcpy(to, from, len);
- return retval;
-}
-
-static void *linux_kernel_memcpy(void *to, const void *from, size_t len) {
- return __memcpy(to,from,len);
-}
-
-#endif /* ARCH_X86 */
-
-static struct {
- char *name;
- void *(* function)(void *to, const void *from, size_t len);
- unsigned long long time;
- uint32_t cpu_require;
-} memcpy_method[] =
-{
- { "glibc memcpy()", memcpy, 0, 0 },
-#ifdef ARCH_X86
- { "linux kernel memcpy()", linux_kernel_memcpy, 0, 0 },
- { "MMX optimized memcpy()", mmx_memcpy, 0, MM_MMX },
- { "MMXEXT optimized memcpy()", mmx2_memcpy, 0, MM_MMXEXT },
- { "SSE optimized memcpy()", sse_memcpy, 0, MM_MMXEXT|MM_SSE },
-#endif /* ARCH_X86 */
- { NULL, NULL, 0, 0 }
-};
-
-#ifdef ARCH_X86
-static unsigned long long int rdtsc()
-{
- unsigned long long int x;
- __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
- return x;
-}
-#else
-static unsigned long long int rdtsc()
-{
- /* FIXME: implement an equivalent for using optimized memcpy on other
- architectures */
- return 0;
-}
-#endif
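
/* Aside: ".byte 0x0f, 0x31" is the hand-encoded RDTSC opcode, spelled out
 * for assemblers that predate the mnemonic. The FIXME branch could be
 * filled with any monotonic tick source, since the benchmark only compares
 * relative durations; a sketch assuming POSIX clock_gettime():
 */
#include <time.h>

static unsigned long long int rdtsc_posix(void)
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);  /* monotonic nanosecond clock */
  return (unsigned long long int)ts.tv_sec * 1000000000ULL
       + (unsigned long long int)ts.tv_nsec;
}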
-
-
-#define BUFSIZE (1024*1024)
-void probe_fast_memcpy(config_values_t *config)
-{
-unsigned long long t;
-char *buf1, *buf2;
-int i, j, best;
-static int config_flags = -1;
-
-#ifdef ARCH_X86
- config_flags = mm_accel();
-#else
- config_flags = 0;
-#endif
-
- best = config->lookup_int (config, "fast_memcpy", -1);
-  /* check if a valid method is configured for this machine; bound the
-   * index so a corrupt config entry cannot read past the table */
-  if( best >= 0 &&
-      best < (int)(sizeof(memcpy_method)/sizeof(memcpy_method[0])) - 1 &&
-      (config_flags & memcpy_method[best].cpu_require) ==
-       memcpy_method[best].cpu_require ) {
- printf("xine: using %s\n", memcpy_method[best].name );
- fast_memcpy = memcpy_method[best].function;
- return;
- }
-
-  /* no usable method was configured: fall back to plain memcpy and reset
-   * "best" so a stale config index cannot win the comparison below */
-  fast_memcpy = memcpy;
-  best = -1;
-
- if( (buf1 = malloc(BUFSIZE)) == NULL )
- return;
-
- if( (buf2 = malloc(BUFSIZE)) == NULL ) {
- free(buf1);
- return;
- }
-
- printf("Benchmarking memcpy methods (smaller is better):\n");
-  /* touch the buffers to make sure they are present in physical memory */
-  memset(buf1, 0, BUFSIZE);
-  memset(buf2, 0, BUFSIZE);
-
- for(i=0; memcpy_method[i].name; i++)
- {
- if( (config_flags & memcpy_method[i].cpu_require) !=
- memcpy_method[i].cpu_require )
- continue;
-
- t = rdtsc();
- for(j=0;j<100;j++)
- memcpy_method[i].function(buf1,buf2,BUFSIZE);
- t = rdtsc() - t;
- memcpy_method[i].time = t;
-
-    printf("\t%s : %llu\n", memcpy_method[i].name, t);
-
- if( best == -1 || t < memcpy_method[best].time )
- best = i;
- }
- printf("xine: using %s\n", memcpy_method[best].name );
- fast_memcpy = memcpy_method[best].function;
- config->set_int (config, "fast_memcpy", best );
-
- free(buf1);
- free(buf2);
-}
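
For context, the point of this file is the fast_memcpy function-pointer
indirection: probe once at startup, then route every large copy through the
pointer. A minimal usage sketch; engine_init and copy_frame are hypothetical
names, while config_values_t, probe_fast_memcpy and fast_memcpy come from
xine's internal headers:

#include <stddef.h>
#include "xine_internal.h"  /* config_values_t, fast_memcpy, probe_fast_memcpy */

void engine_init(config_values_t *config)
{
  /* benchmarks all methods the CPU supports (or reuses the saved choice)
     and points fast_memcpy at the winner */
  probe_fast_memcpy(config);
}

void copy_frame(void *dst, const void *src, size_t frame_size)
{
  fast_memcpy(dst, src, frame_size);   /* e.g. sse_memcpy on a Pentium 3 */
}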