author     Daniel Caujolle-Bert <f1rmb@users.sourceforge.net>   2001-10-25 09:23:15 +0000
committer  Daniel Caujolle-Bert <f1rmb@users.sourceforge.net>   2001-10-25 09:23:15 +0000
commit     bf1afaa6fb6b63ffb324e1e1f972cc4f03634825 (patch)
tree       74883fe072544ed97c10d6a30fb3f6500aa003ec /src
parent     a5049f8fa3fe56454a2763d383af2a568c82d964 (diff)
download   xine-lib-bf1afaa6fb6b63ffb324e1e1f972cc4f03634825.tar.gz
           xine-lib-bf1afaa6fb6b63ffb324e1e1f972cc4f03634825.tar.bz2
This file has already been moved to the xine-utils subdir.
CVS patchset: 883
CVS date: 2001/10/25 09:23:15
Diffstat (limited to 'src')
 -rw-r--r--  src/xine-engine/memcpy.c | 451
 1 file changed, 0 insertions(+), 451 deletions(-)
diff --git a/src/xine-engine/memcpy.c b/src/xine-engine/memcpy.c
deleted file mode 100644
index 511dd38ef..000000000
--- a/src/xine-engine/memcpy.c
+++ /dev/null
@@ -1,451 +0,0 @@
-/*
- * Copyright (C) 2001 the xine project
- *
- * This file is part of xine, a unix video player.
- *
- * xine is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * xine is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
- *
- * These are the MMX/MMX2/SSE optimized versions of memcpy
- *
- * This code was adapted from Linux Kernel sources by Nick Kurshev to
- * the mplayer program. (http://mplayer.sourceforge.net)
- *
- * Miguel Freitas split the #ifdefs into several specialized functions that
- * are benchmarked at runtime by xine. Some original comments from Nick
- * have been preserved documenting some MMX/SSE oddities.
- * Also added a kernel memcpy function that seems faster than the glibc one.
- *
- */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdlib.h>
-#include <string.h>
-#include "xine_internal.h"
-#include "cpu_accel.h"
-
-void *(* fast_memcpy)(void *to, const void *from, size_t len);
-
-/* Original comments from mplayer (file: aclib.c)
- This part of code was taken by me from Linux-2.4.3 and slightly modified
-for the MMX, MMX2, SSE instruction sets. I have done it since linux uses
-page-aligned blocks but mplayer uses weakly ordered data and the original
-sources could not speed them up. Only using PREFETCHNTA and MOVNTQ together
-has an effect!
-
-From IA-32 Intel Architecture Software Developer's Manual Volume 1,
-
-Order Number 245470:
-"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
-
-Data referenced by a program can be temporal (data will be used again) or
-non-temporal (data will be referenced once and not reused in the immediate
-future). To make efficient use of the processor's caches, it is generally
-desirable to cache temporal data and not cache non-temporal data. Overloading
-the processor's caches with non-temporal data is sometimes referred to as
-"polluting the caches".
-The non-temporal data is written to memory with Write-Combining semantics.
-
-The PREFETCHh instructions permit a program to load data into the processor
-at a suggested cache level, so that it is closer to the processor's load and
-store unit when it is needed. If the data is already present in a level of
-the cache hierarchy that is closer to the processor, the PREFETCHh instruction
-will not result in any data movement.
-But we should use PREFETCHNTA: it fetches non-temporal data into a location
-close to the processor, minimizing cache pollution.
-
-The MOVNTQ (store quadword using non-temporal hint) instruction stores
-packed integer data from an MMX register to memory, using a non-temporal hint.
-The MOVNTPS (store packed single-precision floating-point values using
-non-temporal hint) instruction stores packed floating-point data from an
-XMM register to memory, using a non-temporal hint.
-
-The SFENCE (Store Fence) instruction controls write ordering by creating a
-fence for memory store operations. This instruction guarantees that the results
-of every store instruction that precedes the store fence in program order are
-globally visible before any store instruction that follows the fence. The
-SFENCE instruction provides an efficient way of ensuring ordering between
-procedures that produce weakly-ordered data and procedures that consume that
-data.
-
-If you have questions please contact me: Nick Kurshev: nickols_k@mail.ru.
-*/
-
-/* mmx v.1 Note: Since we added alignment of the destination, it speeds up
-    memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the
-    standard (non MMX-optimized) version.
-    Note: on K6-2+ it speeds up memory copying by up to 25% and
-          on K7 and P3 by about 500% (5 times).
-*/
-
-/* Additional notes on gcc assembly and processors: [MF]
-prefetch is specific to AMD processors; the Intel ones should be
-prefetch0, prefetch1, prefetch2, which are not recognized by my gcc.
-prefetchnta is supported on both the Athlon and the Pentium 3.
-
-therefore i will take the prefetchnta instructions out of the mmx1 version
-to avoid problems on pentium mmx and k6-2.
-
-quote of the day:
-"Using prefetches efficiently is more of an art than a science"
-*/
-
-
-#ifdef ARCH_X86
-
-/* for small memory blocks (<256 bytes) this version is faster */
-#define small_memcpy(to,from,n)\
-{\
-register unsigned long int dummy;\
-__asm__ __volatile__(\
-  "rep; movsb"\
-  :"=&D"(to), "=&S"(from), "=&c"(dummy)\
-  :"0" (to), "1" (from),"2" (n)\
-  : "memory");\
-}
-
-/* linux kernel __memcpy (from: /include/asm/string.h) */
-static inline void * __memcpy(void * to, const void * from, size_t n)
-{
-int d0, d1, d2;
-
-  if( n < 4 ) {
-    small_memcpy(to,from,n);
-  }
-  else
-    __asm__ __volatile__(
-    "rep ; movsl\n\t"
-    "testb $2,%b4\n\t"
-    "je 1f\n\t"
-    "movsw\n"
-    "1:\ttestb $1,%b4\n\t"
-    "je 2f\n\t"
-    "movsb\n"
-    "2:"
-    : "=&c" (d0), "=&D" (d1), "=&S" (d2)
-    :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
-    : "memory");
-
-  return (to);
-}
-
-#define SSE_MMREG_SIZE 16
-#define MMX_MMREG_SIZE 8
-
-#define MMX1_MIN_LEN 0x800  /* 2K blocks */
-#define MIN_LEN 0x40  /* 64-byte blocks */
-
-/* SSE note: i tried to move 128 bytes at a time instead of 64 but it
-didn't make any measurable difference. i'm using 64 for the sake of
-simplicity. [MF] */
-static void * sse_memcpy(void * to, const void * from, size_t len)
-{
-  void *retval;
-  size_t i;
-  retval = to;
-
-  /* PREFETCH has effect even for MOVSB instruction ;) */
-  __asm__ __volatile__ (
-    "   prefetchnta (%0)\n"
-    "   prefetchnta 64(%0)\n"
-    "   prefetchnta 128(%0)\n"
-    "   prefetchnta 192(%0)\n"
-    "   prefetchnta 256(%0)\n"
-    : : "r" (from) );
-
-  if(len >= MIN_LEN)
-  {
-    register unsigned long int delta;
-    /* Align destination to SSE_MMREG_SIZE boundary */
-    delta = ((unsigned long int)to)&(SSE_MMREG_SIZE-1);
-    if(delta)
-    {
-      delta=SSE_MMREG_SIZE-delta;
-      len -= delta;
-      small_memcpy(to, from, delta);
-    }
-    i = len >> 6; /* len/64 */
-    len&=63;
-    if(((unsigned long)from) & 15)
-      /* if SRC is misaligned */
-      for(; i>0; i--)
-      {
-        __asm__ __volatile__ (
-        "prefetchnta 320(%0)\n"
-        "movups (%0), %%xmm0\n"
-        "movups 16(%0), %%xmm1\n"
-        "movups 32(%0), %%xmm2\n"
-        "movups 48(%0), %%xmm3\n"
-        "movntps %%xmm0, (%1)\n"
-        "movntps %%xmm1, 16(%1)\n"
-        "movntps %%xmm2, 32(%1)\n"
-        "movntps %%xmm3, 48(%1)\n"
-        :: "r" (from), "r" (to) : "memory");
-        ((const unsigned char *)from)+=64;
-        ((unsigned char *)to)+=64;
-      }
-    else
-      /*
-         Only if SRC is aligned on a 16-byte boundary.
-         This allows movaps to be used instead of movups, which requires
-         the data to be aligned, or a general-protection exception (#GP)
-         is generated.
-      */
-      for(; i>0; i--)
-      {
-        __asm__ __volatile__ (
-        "prefetchnta 320(%0)\n"
-        "movaps (%0), %%xmm0\n"
-        "movaps 16(%0), %%xmm1\n"
-        "movaps 32(%0), %%xmm2\n"
-        "movaps 48(%0), %%xmm3\n"
-        "movntps %%xmm0, (%1)\n"
-        "movntps %%xmm1, 16(%1)\n"
-        "movntps %%xmm2, 32(%1)\n"
-        "movntps %%xmm3, 48(%1)\n"
-        :: "r" (from), "r" (to) : "memory");
-        ((const unsigned char *)from)+=64;
-        ((unsigned char *)to)+=64;
-      }
-    /* since movntps is weakly-ordered, a "sfence"
-     * is needed to become ordered again. */
-    __asm__ __volatile__ ("sfence":::"memory");
-    /* re-enable use of the FPU */
-    __asm__ __volatile__ ("emms":::"memory");
-  }
-  /*
-   * Now do the tail of the block
-   */
-  if(len) __memcpy(to, from, len);
-  return retval;
-}
-
-static void * mmx_memcpy(void * to, const void * from, size_t len)
-{
-  void *retval;
-  size_t i;
-  retval = to;
-
-  if(len >= MMX1_MIN_LEN)
-  {
-    register unsigned long int delta;
-    /* Align destination to MMX_MMREG_SIZE boundary */
-    delta = ((unsigned long int)to)&(MMX_MMREG_SIZE-1);
-    if(delta)
-    {
-      delta=MMX_MMREG_SIZE-delta;
-      len -= delta;
-      small_memcpy(to, from, delta);
-    }
-    i = len >> 6; /* len/64 */
-    len&=63;
-    for(; i>0; i--)
-    {
-      __asm__ __volatile__ (
-      "movq (%0), %%mm0\n"
-      "movq 8(%0), %%mm1\n"
-      "movq 16(%0), %%mm2\n"
-      "movq 24(%0), %%mm3\n"
-      "movq 32(%0), %%mm4\n"
-      "movq 40(%0), %%mm5\n"
-      "movq 48(%0), %%mm6\n"
-      "movq 56(%0), %%mm7\n"
-      "movq %%mm0, (%1)\n"
-      "movq %%mm1, 8(%1)\n"
-      "movq %%mm2, 16(%1)\n"
-      "movq %%mm3, 24(%1)\n"
-      "movq %%mm4, 32(%1)\n"
-      "movq %%mm5, 40(%1)\n"
-      "movq %%mm6, 48(%1)\n"
-      "movq %%mm7, 56(%1)\n"
-      :: "r" (from), "r" (to) : "memory");
-      ((const unsigned char *)from)+=64;
-      ((unsigned char *)to)+=64;
-    }
-    __asm__ __volatile__ ("emms":::"memory");
-  }
-  /*
-   * Now do the tail of the block
-   */
-  if(len) __memcpy(to, from, len);
-  return retval;
-}
-
-void * mmx2_memcpy(void * to, const void * from, size_t len)
-{
-  void *retval;
-  size_t i;
-  retval = to;
-
-  /* PREFETCH has effect even for MOVSB instruction ;) */
-  __asm__ __volatile__ (
-    "   prefetchnta (%0)\n"
-    "   prefetchnta 64(%0)\n"
-    "   prefetchnta 128(%0)\n"
-    "   prefetchnta 192(%0)\n"
-    "   prefetchnta 256(%0)\n"
-    : : "r" (from) );
-
-  if(len >= MIN_LEN)
-  {
-    register unsigned long int delta;
-    /* Align destination to MMX_MMREG_SIZE boundary */
-    delta = ((unsigned long int)to)&(MMX_MMREG_SIZE-1);
-    if(delta)
-    {
-      delta=MMX_MMREG_SIZE-delta;
-      len -= delta;
-      small_memcpy(to, from, delta);
-    }
-    i = len >> 6; /* len/64 */
-    len&=63;
-    for(; i>0; i--)
-    {
-      __asm__ __volatile__ (
-      "prefetchnta 320(%0)\n"
-      "movq (%0), %%mm0\n"
-      "movq 8(%0), %%mm1\n"
-      "movq 16(%0), %%mm2\n"
-      "movq 24(%0), %%mm3\n"
-      "movq 32(%0), %%mm4\n"
-      "movq 40(%0), %%mm5\n"
-      "movq 48(%0), %%mm6\n"
-      "movq 56(%0), %%mm7\n"
-      "movntq %%mm0, (%1)\n"
-      "movntq %%mm1, 8(%1)\n"
-      "movntq %%mm2, 16(%1)\n"
-      "movntq %%mm3, 24(%1)\n"
-      "movntq %%mm4, 32(%1)\n"
-      "movntq %%mm5, 40(%1)\n"
-      "movntq %%mm6, 48(%1)\n"
-      "movntq %%mm7, 56(%1)\n"
-      :: "r" (from), "r" (to) : "memory");
-      ((const unsigned char *)from)+=64;
-      ((unsigned char *)to)+=64;
-    }
-    /* since movntq is weakly-ordered, a "sfence"
-     * is needed to become ordered again. */
-    __asm__ __volatile__ ("sfence":::"memory");
-    __asm__ __volatile__ ("emms":::"memory");
-  }
-  /*
-   * Now do the tail of the block
-   */
-  if(len) __memcpy(to, from, len);
-  return retval;
-}
-
-static void *linux_kernel_memcpy(void *to, const void *from, size_t len) {
-  return __memcpy(to,from,len);
-}
-
-#endif /* ARCH_X86 */
-
-static struct {
-  char *name;
-  void *(* function)(void *to, const void *from, size_t len);
-  unsigned long long time;
-  uint32_t cpu_require;
-} memcpy_method[] =
-{
-  { "glibc memcpy()", memcpy, 0, 0 },
-#ifdef ARCH_X86
-  { "linux kernel memcpy()", linux_kernel_memcpy, 0, 0 },
-  { "MMX optimized memcpy()", mmx_memcpy, 0, MM_MMX },
-  { "MMXEXT optimized memcpy()", mmx2_memcpy, 0, MM_MMXEXT },
-  { "SSE optimized memcpy()", sse_memcpy, 0, MM_MMXEXT|MM_SSE },
-#endif /* ARCH_X86 */
-  { NULL, NULL, 0, 0 }
-};
-
-#ifdef ARCH_X86
-static unsigned long long int rdtsc()
-{
-  unsigned long long int x;
-  __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
-  return x;
-}
-#else
-static unsigned long long int rdtsc()
-{
-  /* FIXME: implement an equivalent for using optimized memcpy on other
-     architectures */
-  return 0;
-}
-#endif
-
-
-#define BUFSIZE 1024*1024
-void probe_fast_memcpy(config_values_t *config)
-{
-unsigned long long t;
-char *buf1, *buf2;
-int i, j, best;
-static int config_flags = -1;
-
-#ifdef ARCH_X86
-  config_flags = mm_accel();
-#else
-  config_flags = 0;
-#endif
-
-  best = config->lookup_int (config, "fast_memcpy", -1);
-  /* check if a function is configured and valid for this machine */
-  if( best != -1 &&
-      (config_flags & memcpy_method[best].cpu_require) ==
-       memcpy_method[best].cpu_require ) {
-    printf("xine: using %s\n", memcpy_method[best].name );
-    fast_memcpy = memcpy_method[best].function;
-    return;
-  }
-
-  fast_memcpy = memcpy;
-
-  if( (buf1 = malloc(BUFSIZE)) == NULL )
-    return;
-
-  if( (buf2 = malloc(BUFSIZE)) == NULL ) {
-    free(buf1);
-    return;
-  }
-
-  printf("Benchmarking memcpy methods (smaller is better):\n");
-  /* make sure the buffers are present in physical memory */
-  memcpy(buf1,buf2,BUFSIZE);
-
-  for(i=0; memcpy_method[i].name; i++)
-  {
-    if( (config_flags & memcpy_method[i].cpu_require) !=
-         memcpy_method[i].cpu_require )
-      continue;
-
-    t = rdtsc();
-    for(j=0;j<100;j++)
-      memcpy_method[i].function(buf1,buf2,BUFSIZE);
-    t = rdtsc() - t;
-    memcpy_method[i].time = t;
-
-    printf("\t%s : %lld\n",memcpy_method[i].name, t);
-
-    if( best == -1 || t < memcpy_method[best].time )
-      best = i;
-  }
-  printf("xine: using %s\n", memcpy_method[best].name );
-  fast_memcpy = memcpy_method[best].function;
-  config->set_int (config, "fast_memcpy", best );
-
-  free(buf1);
-  free(buf2);
-}
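
The kernel __memcpy above moves n/4 dwords with "rep movsl", then uses bit 1
and bit 0 of n to decide whether a trailing word and byte move are needed. A
sketch of the same decomposition in plain C may make the assembly easier to
follow (kernel_style_copy is an illustrative name, not a xine function; it
assumes a target such as x86 that tolerates unaligned dword access):

#include <stddef.h>
#include <stdint.h>

static void *kernel_style_copy(void *to, const void *from, size_t n)
{
  uint32_t *d4 = to;
  const uint32_t *s4 = from;
  size_t i;

  for (i = 0; i < n / 4; i++)      /* rep movsl: whole dwords */
    *d4++ = *s4++;

  {
    unsigned char *d1 = (unsigned char *)d4;
    const unsigned char *s1 = (const unsigned char *)s4;
    if (n & 2) {                   /* testb $2: one trailing word */
      *d1++ = *s1++;
      *d1++ = *s1++;
    }
    if (n & 1)                     /* testb $1: one trailing byte */
      *d1 = *s1;
  }
  return to;
}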
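The SSE and MMX copy loops share one prologue: copy a few bytes with
small_memcpy() until the destination reaches an MMREG_SIZE boundary, then
stream 64-byte chunks. A minimal standalone sketch of that prologue
(align_prologue is a hypothetical name; the delta > len guard is an extra
safety check, since the callers above only reach this path with len >= 64):

#include <stdint.h>
#include <string.h>

static size_t align_prologue(unsigned char **to, const unsigned char **from,
                             size_t len, size_t mmreg_size)
{
  size_t delta = (uintptr_t)*to & (mmreg_size - 1);

  if (delta) {
    delta = mmreg_size - delta;    /* bytes left to the next boundary */
    if (delta > len)
      delta = len;                 /* guard for tiny buffers */
    memcpy(*to, *from, delta);     /* stands in for small_memcpy() */
    *to   += delta;
    *from += delta;
    len   -= delta;
  }
  return len;  /* remaining length; *to is now mmreg_size-aligned */
}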
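The heart of sse_memcpy() is the prefetchnta / movntps / sfence pattern. The
same technique can be written with SSE compiler intrinsics instead of inline
assembly; the sketch below is a rough equivalent under stated assumptions
(stream_copy64 is a hypothetical name, not part of xine; it assumes an x86
target built with SSE support, a 16-byte-aligned destination as required by
_mm_stream_ps, and a length that is a multiple of 64):

#include <stddef.h>
#include <xmmintrin.h>

static void stream_copy64(void *dst, const void *src, size_t n)
{
  float *d = (float *)dst;
  const float *s = (const float *)src;
  size_t i;

  for (i = 0; i < n / 64; i++) {
    /* prefetch well ahead of the current read position (prefetchnta) */
    _mm_prefetch((const char *)s + 320, _MM_HINT_NTA);

    /* four unaligned 16-byte loads (movups) */
    __m128 x0 = _mm_loadu_ps(s);
    __m128 x1 = _mm_loadu_ps(s + 4);
    __m128 x2 = _mm_loadu_ps(s + 8);
    __m128 x3 = _mm_loadu_ps(s + 12);

    /* non-temporal stores (movntps) bypass the cache on the way out */
    _mm_stream_ps(d,      x0);
    _mm_stream_ps(d + 4,  x1);
    _mm_stream_ps(d + 8,  x2);
    _mm_stream_ps(d + 12, x3);

    s += 16;
    d += 16;
  }
  /* streaming stores are weakly ordered; fence before dst is read (sfence) */
  _mm_sfence();
}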
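Finally, probe_fast_memcpy() picks a routine empirically: it times each
eligible candidate over a 1 MB buffer and keeps the fastest. A self-contained
sketch of that probe-at-startup idea, using portable clock_gettime() timing
instead of rdtsc (probe_copy, copy_fn and PROBE_BUFSIZE are illustrative
names, not xine APIs; only plain memcpy is listed as a candidate here):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define PROBE_BUFSIZE (1024 * 1024)

typedef void *(*copy_fn)(void *, const void *, size_t);

static long long elapsed_ns(struct timespec a, struct timespec b)
{
  return (b.tv_sec - a.tv_sec) * 1000000000LL + (b.tv_nsec - a.tv_nsec);
}

copy_fn probe_copy(void)
{
  struct { const char *name; copy_fn fn; } methods[] = {
    { "glibc memcpy()", memcpy },
    /* further candidates would be listed here, gated on CPU features */
  };
  char *src = malloc(PROBE_BUFSIZE), *dst = malloc(PROBE_BUFSIZE);
  copy_fn best = memcpy;
  long long best_ns = -1;
  size_t i;

  if (!src || !dst) { free(src); free(dst); return memcpy; }
  memcpy(dst, src, PROBE_BUFSIZE); /* fault the pages in before timing */

  for (i = 0; i < sizeof(methods) / sizeof(methods[0]); i++) {
    struct timespec t0, t1;
    long long ns;
    int j;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (j = 0; j < 100; j++)
      methods[i].fn(dst, src, PROBE_BUFSIZE);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    ns = elapsed_ns(t0, t1);
    printf("%s : %lld ns\n", methods[i].name, ns);
    if (best_ns < 0 || ns < best_ns) { best_ns = ns; best = methods[i].fn; }
  }
  free(src);
  free(dst);
  return best;  /* caller stores this in its fast_memcpy-style pointer */
}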