 HISTORY      |  3 +++
 dxr3memcpy.c | 54 ++++++++++++++++++++++++++++++-----------------------
 2 files changed, 36 insertions(+), 21 deletions(-)
diff --git a/HISTORY b/HISTORY
index 48a2c64..eefbbc6 100644
--- a/HISTORY
+++ b/HISTORY
@@ -243,6 +243,9 @@ NOTE: I havent found time to include all of the languages, will be done in pre2
(Christian Gmeiner, Paavo Hartikainen)
- added many comments into source (Christian Gmeiner)
- using doxygen for docs (Christian Gmeiner)
+- extended cDxr3MemcpyBench::Rdtsc(uint32_t config_flags): support for
+  non-x86 archs and for CPUs that don't support rdtsc timing
+  (Christian Gmeiner)
- added support for VDR 1.3.13 and later (Luca Olivetti, Peter Dittmann)
- removed -lz from makefile (Christian Gmeiner)
- compiles now with 3.4.x gcc's (Christian Gmeiner, Ville Skyttä)
diff --git a/dxr3memcpy.c b/dxr3memcpy.c
index 979d2c4..374cc32 100644
--- a/dxr3memcpy.c
+++ b/dxr3memcpy.c
@@ -34,6 +34,8 @@
#include "dxr3log.h"
#include "dxr3cpu.h"
#include "dxr3memcpy.h"
+#include <sys/times.h>
+#include <limits.h>
// ==================================
@@ -43,6 +45,9 @@ void *(* dxr3_memcpy)(void *to, const void *from, size_t len);
#if defined(__i386__) || defined(__x86_64__)
// ==================================
// for small memory blocks (<256 bytes) this version is faster
+#define small_memcpy(to,from,n) { register unsigned long int dummy; __asm__ __volatile__("rep; movsb":"=&D"(to), "=&S"(from), "=&c"(dummy) :"0" (to), "1" (from),"2" (n) : "memory"); }
+/*
+// -- doesn't compile with 2.95 gcc --
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
@@ -52,9 +57,9 @@ __asm__ __volatile__(\
:"0" (to), "1" (from),"2" (n)\
: "memory");\
}
-
+*/
// ==================================
-// linux kernel __memcpy (from: /include/asm/string.h)
+//! linux kernel __memcpy (from: /include/asm/string.h)
static __inline__ void * __memcpy (
void * to,
const void * from,
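Spelled out, the one-line small_memcpy macro added above expands to roughly the following sketch (illustrative only, assuming GCC inline-asm syntax; small_memcpy_sketch is a hypothetical name, not plugin code):

static inline void small_memcpy_sketch(void *to, const void *from, unsigned long n)
{
    unsigned long dummy;
    /* "rep; movsb" copies CX bytes from [SI] to [DI], one byte per
     * iteration; there is no alignment or loop-setup overhead, which is
     * why it wins for blocks under 256 bytes. */
    __asm__ __volatile__(
        "rep; movsb"
        : "=&D" (to), "=&S" (from), "=&c" (dummy)  /* DI/SI/CX are clobbered */
        : "0" (to), "1" (from), "2" (n)            /* and start as to/from/n */
        : "memory");
}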
@@ -141,7 +146,7 @@ static void * sse_memcpy(void * to, const void * from, size_t len)
"movntps %%xmm1, 16(%1)\n"
"movntps %%xmm2, 32(%1)\n"
"movntps %%xmm3, 48(%1)\n"
- :: "r" (from), "r" (to) : "memory");
+ : : "r" (from), "r" (to) : "memory");
from = ((const unsigned char *)from) + 64;
to = ((unsigned char *)to) + 64;
}
@@ -164,15 +169,15 @@ static void * sse_memcpy(void * to, const void * from, size_t len)
"movntps %%xmm1, 16(%1)\n"
"movntps %%xmm2, 32(%1)\n"
"movntps %%xmm3, 48(%1)\n"
- :: "r" (from), "r" (to) : "memory");
+ : : "r" (from), "r" (to) : "memory");
from = ((const unsigned char *)from) + 64;
to = ((unsigned char *)to) + 64;
}
/* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
- __asm__ __volatile__ ("sfence":::"memory");
+ __asm__ __volatile__ ("sfence": : :"memory");
/* enables to use FPU */
- __asm__ __volatile__ ("emms":::"memory");
+ __asm__ __volatile__ ("emms": : :"memory");
}
/*
* Now do the tail of the block
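As a cross-check, the movntps/sfence pattern above can also be written with SSE intrinsics. A minimal sketch, assuming <xmmintrin.h> and 16-byte-aligned buffers (stream_copy64 is a hypothetical name, not plugin code):

#include <stddef.h>
#include <xmmintrin.h>

/* Copy 64-byte blocks with non-temporal (cache-bypassing) stores.
 * Streaming stores are weakly ordered, so _mm_sfence() must run
 * before anyone reads the destination. */
static void stream_copy64(float *to, const float *from, size_t blocks)
{
    for (size_t i = 0; i < blocks; i++, from += 16, to += 16) {
        __m128 a = _mm_load_ps(from);       /* aligned 16-byte loads */
        __m128 b = _mm_load_ps(from + 4);
        __m128 c = _mm_load_ps(from + 8);
        __m128 d = _mm_load_ps(from + 12);
        _mm_stream_ps(to,      a);          /* movntps */
        _mm_stream_ps(to +  4, b);
        _mm_stream_ps(to +  8, c);
        _mm_stream_ps(to + 12, d);
    }
    _mm_sfence();                           /* order the streaming stores */
}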
@@ -220,11 +225,11 @@ static void * mmx_memcpy(void * to, const void * from, size_t len)
"movq %%mm5, 40(%1)\n"
"movq %%mm6, 48(%1)\n"
"movq %%mm7, 56(%1)\n"
- :: "r" (from), "r" (to) : "memory");
+ : : "r" (from), "r" (to) : "memory");
from = ((const unsigned char *)from) + 64;
to = ((unsigned char *)to) + 64;
}
- __asm__ __volatile__ ("emms":::"memory");
+ __asm__ __volatile__ ("emms": : :"memory");
}
/*
* Now do the tail of the block
@@ -252,7 +257,7 @@ static void * mmx2_memcpy(void * to, const void * from, size_t len)
" prefetchnta 224(%0)\n"
" prefetchnta 256(%0)\n"
" prefetchnta 288(%0)\n"
- :: "r" (from) );
+ : : "r" (from) );
if(len >= MIN_LEN)
{
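The prefetchnta ladder above has an intrinsic form as well; a hedged sketch (prefetch_nta_window is a hypothetical helper, and the ~320-byte window mirrors the offsets visible in this hunk):

#include <xmmintrin.h>

/* Hint the next ~320 bytes of the source toward the core with a
 * non-temporal hint, so the copy loop rarely stalls on memory. */
static inline void prefetch_nta_window(const void *from)
{
    for (int off = 0; off <= 288; off += 32)
        _mm_prefetch((const char *)from + off, _MM_HINT_NTA);
}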
@@ -288,14 +293,14 @@ static void * mmx2_memcpy(void * to, const void * from, size_t len)
"movntq %%mm5, 40(%1)\n"
"movntq %%mm6, 48(%1)\n"
"movntq %%mm7, 56(%1)\n"
- :: "r" (from), "r" (to) : "memory");
+ : : "r" (from), "r" (to) : "memory");
from = ((const unsigned char *)from) + 64;
to = ((unsigned char *)to) + 64;
}
/* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
- __asm__ __volatile__ ("sfence":::"memory");
- __asm__ __volatile__ ("emms":::"memory");
+ __asm__ __volatile__ ("sfence": : :"memory");
+ __asm__ __volatile__ ("emms": : :"memory");
}
/*
* Now do the tail of the block
@@ -388,13 +393,13 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
}
// count 100 runs of the memcpy function
- t = Rdtsc();
+ t = Rdtsc(config_flags);
for (j = 0; j < 50; j++)
{
m_methods[i].function(buf2,buf1,BUFSIZE);
m_methods[i].function(buf1,buf2,BUFSIZE);
}
- t = Rdtsc() - t;
+ t = Rdtsc(config_flags) - t;
m_methods[i].time = t;
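The harness around each candidate is a simple measure-loop-measure pattern. A self-contained sketch, using POSIX clock_gettime in place of the plugin's Rdtsc (bench_ns, memcpy_fn, and the buffer parameters are hypothetical stand-ins):

#include <stddef.h>
#include <time.h>

typedef void *(*memcpy_fn)(void *to, const void *from, size_t len);

/* Time 100 copies of len bytes: 50 passes, two calls per pass. */
static long long bench_ns(memcpy_fn candidate, void *buf1, void *buf2, size_t len)
{
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int j = 0; j < 50; j++) {
        candidate(buf2, buf1, len);
        candidate(buf1, buf2, len);
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);
    return (t1.tv_sec - t0.tv_sec) * 1000000000LL
         + (t1.tv_nsec - t0.tv_nsec);
}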
@@ -417,15 +422,22 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags)
// ==================================
//! needed for exact timing
-unsigned long long int cDxr3MemcpyBench::Rdtsc()
+unsigned long long int cDxr3MemcpyBench::Rdtsc(uint32_t config_flags)
{
#if defined(__i386__) || defined(__x86_64__)
- unsigned long long int x;
- __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
- return x;
+ // we need rdtsc support
+	if (config_flags & CC_MMX)
+ {
+ unsigned long long int x;
+ __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
+ return x;
+ }
+ else
+ {
+ return times(NULL);
+ }
#else
- /* FIXME: implement an equivalent for using optimized memcpy on other
- architectures */
- return 0;
+ struct tms tp;
+ return times(&tp);
#endif /* __i386__ || __x86_64__ */
}
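For reference, the timer selection above in stand-alone form. A sketch under stated assumptions: CC_MMX stands in for the plugin's CPU-capability bit (the value 0x01 here is illustrative), the rdtsc result is assembled from eax/edx explicitly so the same code also holds on x86_64, and a real tms buffer is passed because times(NULL) is Linux-specific rather than portable:

#include <stdint.h>
#include <sys/times.h>

#define CC_MMX 0x01   /* illustrative value for the capability bit */

static unsigned long long read_timer(uint32_t config_flags)
{
#if defined(__i386__) || defined(__x86_64__)
    /* Bitwise & tests the single capability bit; a logical && would
     * accept any nonzero config_flags. */
    if (config_flags & CC_MMX) {
        uint32_t lo, hi;
        __asm__ volatile ("rdtsc" : "=a" (lo), "=d" (hi));
        return ((unsigned long long)hi << 32) | lo;
    }
#endif
    struct tms tp;
    return (unsigned long long)times(&tp);  /* clock ticks, coarser than TSC cycles */
}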