path: root/dxr3memcpy.c
author     scop <scop>  2005-03-14 13:53:29 +0000
committer  scop <scop>  2005-03-14 13:53:29 +0000
commit     199be12e1e23d7d741e49ea3495cf9dc5f44bdbb (patch)
tree       e91213a087a59bd333745340c8ca2f732472c0cd /dxr3memcpy.c
parent     b7225753e91a5500a9624ce7de9365d9f8523106 (diff)
download   vdr-plugin-dxr3-199be12e1e23d7d741e49ea3495cf9dc5f44bdbb.tar.gz
           vdr-plugin-dxr3-199be12e1e23d7d741e49ea3495cf9dc5f44bdbb.tar.bz2
More GCC 3.4+ compilation fixes.
Diffstat (limited to 'dxr3memcpy.c')
-rw-r--r--  dxr3memcpy.c  108
1 file changed, 49 insertions, 59 deletions
diff --git a/dxr3memcpy.c b/dxr3memcpy.c
index 0d59e09..30c857d 100644
--- a/dxr3memcpy.c
+++ b/dxr3memcpy.c
@@ -34,20 +34,12 @@
#include "dxr3log.h"
#include "dxr3cpu.h"
#include "dxr3memcpy.h"
-#include <sys/times.h>
-#include <limits.h>
-
-// ==================================
-//! our function pointer
void *(* dxr3_memcpy)(void *to, const void *from, size_t len);
-#ifdef __i386__
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
// ==================================
// for small memory blocks (<256 bytes) this version is faster
-#define small_memcpy(to,from,n) { register unsigned long int dummy; __asm__ __volatile__("rep; movsb":"=&D"(to), "=&S"(from), "=&c"(dummy) :"0" (to), "1" (from),"2" (n) : "memory"); }
-/*
-// -- dosn't compile with 2.95 gcc --
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
@@ -57,9 +49,9 @@ __asm__ __volatile__(\
:"0" (to), "1" (from),"2" (n)\
: "memory");\
}
-*/
+
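The restored multi-line form above is equivalent to the one-line version it replaces: a single "rep; movsb" string copy, which is hard to beat for blocks under 256 bytes. A standalone sketch of the same technique, assuming x86/x86-64 and GCC-style inline assembly (the function name is illustrative, not part of the plugin):

#include <stddef.h>

/* Copy n bytes with one "rep movsb": (E/R)DI = destination,
 * (E/R)SI = source, (E/R)CX = count; the CPU's string engine does the
 * looping.  Illustrative only, same constraint pattern as the macro above. */
static inline void *rep_movsb_copy(void *to, const void *from, size_t n)
{
    void *ret = to;
    __asm__ __volatile__("rep; movsb"
                         : "=&D" (to), "=&S" (from), "=&c" (n)
                         : "0" (to), "1" (from), "2" (n)
                         : "memory");
    return ret;
}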
// ==================================
-//! linux kernel __memcpy (from: /include/asm/string.h)
+// linux kernel __memcpy (from: /include/asm/string.h)
static __inline__ void * __memcpy (
void * to,
const void * from,
@@ -94,6 +86,9 @@ int d0, d1, d2;
#define MIN_LEN 0x40 /* 64-byte blocks */
+// Test for GCC > 3.2.0
+#if GCC_VERSION > 30200
+
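GCC_VERSION is not defined in this file; presumably it is derived from GCC's built-in version macros elsewhere in the plugin. A sketch of the conventional definition that matches the "> 30200" comparison, offered as an assumption rather than the plugin's actual code:

/* Assumed definition (not visible in this patch): encodes GCC 3.2.0 as
 * 30200, 3.4.1 as 30401, and so on, so "GCC_VERSION > 30200" reads as
 * "newer than GCC 3.2.0". */
#ifndef GCC_VERSION
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif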
// ==================================
/* SSE note: i tried to move 128 bytes a time instead of 64 but it
didn't make any measureable difference. i'm using 64 for the sake of
@@ -146,9 +141,9 @@ static void * sse_memcpy(void * to, const void * from, size_t len)
"movntps %%xmm1, 16(%1)\n"
"movntps %%xmm2, 32(%1)\n"
"movntps %%xmm3, 48(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- ((const unsigned char *)from)+=64;
- ((unsigned char *)to)+=64;
+ :: "r" (from), "r" (to) : "memory");
+ from = ((const unsigned char *)from) + 64;
+ to = ((unsigned char *)to) + 64;
}
else
/*
@@ -169,15 +164,15 @@ static void * sse_memcpy(void * to, const void * from, size_t len)
"movntps %%xmm1, 16(%1)\n"
"movntps %%xmm2, 32(%1)\n"
"movntps %%xmm3, 48(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- ((const unsigned char *)from)+=64;
- ((unsigned char *)to)+=64;
+ :: "r" (from), "r" (to) : "memory");
+ from = ((const unsigned char *)from) + 64;
+ to = ((unsigned char *)to) + 64;
}
/* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
- __asm__ __volatile__ ("sfence": : :"memory");
+ __asm__ __volatile__ ("sfence":::"memory");
/* enables to use FPU */
- __asm__ __volatile__ ("emms": : :"memory");
+ __asm__ __volatile__ ("emms":::"memory");
}
/*
* Now do the tail of the block
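The pointer-advance rewrites in this and the following hunks are the substance of the GCC 3.4+ fix: "((unsigned char *)to) += 64;" relies on the cast-as-lvalue extension, which GCC removed in 3.4, so the increment has to be expressed as a plain assignment. A minimal illustration of the failing form and its replacement (the helper name is illustrative):

#include <stddef.h>

/* Old form, rejected by GCC 3.4+ ("invalid lvalue in assignment"):
 *     ((const unsigned char *)from) += 64;
 *     ((unsigned char *)to) += 64;
 * Replacement used throughout this patch: */
static void advance_one_block(const void **from, void **to)
{
    *from = (const unsigned char *)*from + 64;   // step source past the copied block
    *to   = (unsigned char *)*to + 64;           // step destination the same amount
}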
@@ -225,11 +220,11 @@ static void * mmx_memcpy(void * to, const void * from, size_t len)
"movq %%mm5, 40(%1)\n"
"movq %%mm6, 48(%1)\n"
"movq %%mm7, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- ((const unsigned char *)from)+=64;
- ((unsigned char *)to)+=64;
+ :: "r" (from), "r" (to) : "memory");
+ from = ((const unsigned char *)from) + 64;
+ to = ((unsigned char *)to) + 64;
}
- __asm__ __volatile__ ("emms": : :"memory");
+ __asm__ __volatile__ ("emms":::"memory");
}
/*
* Now do the tail of the block
@@ -257,7 +252,7 @@ static void * mmx2_memcpy(void * to, const void * from, size_t len)
" prefetchnta 224(%0)\n"
" prefetchnta 256(%0)\n"
" prefetchnta 288(%0)\n"
- : : "r" (from) );
+ :: "r" (from) );
if(len >= MIN_LEN)
{
@@ -293,14 +288,14 @@ static void * mmx2_memcpy(void * to, const void * from, size_t len)
"movntq %%mm5, 40(%1)\n"
"movntq %%mm6, 48(%1)\n"
"movntq %%mm7, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- ((const unsigned char *)from)+=64;
- ((unsigned char *)to)+=64;
+ :: "r" (from), "r" (to) : "memory");
+ from = ((const unsigned char *)from) + 64;
+ to = ((unsigned char *)to) + 64;
}
/* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
- __asm__ __volatile__ ("sfence": : :"memory");
- __asm__ __volatile__ ("emms": : :"memory");
+ __asm__ __volatile__ ("sfence":::"memory");
+ __asm__ __volatile__ ("emms":::"memory");
}
/*
* Now do the tail of the block
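Each of the block-copy loops above ends with a "Now do the tail of the block" section that lies outside these hunks. The idea, sketched below under the assumption that the tail is at most 63 bytes (the loops move 64 bytes per iteration), is to finish the remainder with an ordinary small copy:

#include <stddef.h>
#include <string.h>

/* Illustrative tail handling (not the plugin's exact code): after len/64
 * full blocks have been moved by the SSE/MMX loop, copy the remaining
 * len & 63 bytes conventionally. */
static void copy_tail(void *to, const void *from, size_t len)
{
    size_t done = len & ~(size_t)63;   // bytes already handled by the block loop
    size_t rest = len & 63;            // 0..63 leftover bytes
    if (rest)
        memcpy((unsigned char *)to + done, (const unsigned char *)from + done, rest);
}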
@@ -309,15 +304,17 @@ static void * mmx2_memcpy(void * to, const void * from, size_t len)
return retval;
}
+#endif /*GCC_VERSION > 30200*/
+
// ==================================
static void *linux_kernel_memcpy(void *to, const void *from, size_t len) {
return __memcpy(to,from,len);
}
-#endif /*__i386__*/
+#endif /*ARCH_X86/ARCH_X86_64*/
// ==================================
-//! constr.
+// constr.
cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
{
//
@@ -333,7 +330,7 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
routine.cpu_require = 0;
m_methods.push_back(routine);
- #ifdef __i386__
+ #if defined(ARCH_X86) || defined(ARCH_X86_64)
// linux_kernel_memcpy
routine.name = "linux_kernel_memcpy()";
@@ -341,6 +338,9 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
routine.cpu_require = 0;
m_methods.push_back(routine);
+ // Test for GCC > 3.2.0
+ # if GCC_VERSION > 30200
+
// MMX optimized memcpy()
routine.name = "MMX optimized memcpy()";
routine.function = mmx_memcpy;
@@ -353,7 +353,7 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
routine.cpu_require = CC_MMXEXT;
m_methods.push_back(routine);
- # ifndef __FreeBSD__
+ # ifndef __FreeBSD__
// SSE optimized memcpy()
routine.name = "SSE optimized memcpy()";
@@ -361,8 +361,9 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
routine.cpu_require = CC_MMXEXT|CC_SSE;
m_methods.push_back(routine);
- # endif /*__FreeBSD__*/
- #endif /*__i386__*/
+ # endif /*__FreeBSD__*/
+ # endif /*GCC_VERSION > 30200*/
+ #endif /*ARCH_X86/ARCH_X86_64*/
//
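Each candidate routine is pushed onto m_methods with a display name, a function pointer and the CPU capability flags it needs. The entry type is declared in dxr3memcpy.h and is not part of this patch; a rough sketch of its assumed shape, for orientation only:

#include <stdint.h>
#include <stddef.h>

// Assumed layout of one benchmark entry (the real declaration in
// dxr3memcpy.h may differ):
struct MemcpyRoutineSketch {
    const char *name;                                           // label for the log output
    void *(*function)(void *to, const void *from, size_t len);  // candidate memcpy
    uint32_t cpu_require;                                       // CC_* flags it depends on
    unsigned long long time;                                    // cycles measured by the benchmark
};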
@@ -394,13 +395,13 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
}
// count 100 runs of the memcpy function
- t = Rdtsc(config_flags);
+ t = Rdtsc();
for (j = 0; j < 50; j++)
{
m_methods[i].function(buf2,buf1,BUFSIZE);
m_methods[i].function(buf1,buf2,BUFSIZE);
}
- t = Rdtsc(config_flags) - t;
+ t = Rdtsc() - t;
m_methods[i].time = t;
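The loop above is the whole benchmark: read the time-stamp counter, run the candidate routine 100 times (50 iterations copying BUFSIZE bytes back and forth), read the counter again and keep the cycle delta, presumably so the fastest routine can be selected afterwards. A reduced sketch of the same pattern, with read_tsc() as an assumed stand-in for Rdtsc() (sketched at the end of this page):

#include <stddef.h>

typedef void *(*MemcpyFn)(void *to, const void *from, size_t len);

extern unsigned long long read_tsc(void);  // assumed TSC reader, see sketch below

// Time one candidate: 50 round trips = 100 copies of 'size' bytes each.
static unsigned long long BenchCopy(MemcpyFn f, void *buf1, void *buf2, size_t size)
{
    unsigned long long start = read_tsc();
    for (int j = 0; j < 50; j++) {
        f(buf2, buf1, size);   // forward copy
        f(buf1, buf2, size);   // and back again
    }
    return read_tsc() - start; // elapsed TSC ticks for 100 copies
}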
@@ -422,27 +423,16 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
}
// ==================================
-//! needed for exact timing
-#ifdef __i386__
-unsigned long long int cDxr3MemcpyBench::Rdtsc(uint32_t config_flags)
-{
- // we need rdtsc support
- if (config_flags && CC_MMX)
- {
- unsigned long long int x;
- __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
- return x;
- }
- else
- {
- return times(NULL);
- }
-
-}
-#else
-unsigned long long int cDxr3MemcpyBench::Rdtsc(uint32_t config_flags)
+// needed for exact timing
+unsigned long long int cDxr3MemcpyBench::Rdtsc()
{
- struct tms tp;
- return times(&tp);
+ #ifdef ARCH_X86
+ unsigned long long int x;
+ __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
+ return x;
+ #else
+ /* FIXME: implement an equivalent for using optimized memcpy on other
+ architectures */
+ return 0;
+ #endif
}
-#endif
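The rewritten Rdtsc() emits the RDTSC opcode bytes directly and collects the result through the "=A" constraint, i.e. EDX:EAX, which is only correct on 32-bit x86; on other architectures it now falls back to returning 0, as the FIXME notes. For comparison, a hedged sketch of the same read written with the rdtsc mnemonic and explicit 32-bit halves, which also works on x86-64 where the "=A" trick does not combine EDX:EAX into the full 64-bit value:

// Illustrative TSC reader (not the plugin's code): rdtsc places the low
// 32 bits in EAX and the high 32 bits in EDX on both x86 and x86-64.
static unsigned long long read_tsc(void)
{
#if defined(__i386__) || defined(__x86_64__)
    unsigned int lo, hi;
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
    return ((unsigned long long)hi << 32) | lo;
#else
    return 0;   // mirrors the FIXME above: no cycle counter read here
#endif
}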