From f9c59e544ada17055c3ed15b3d80f0d285c3928d Mon Sep 17 00:00:00 2001 From: scop Date: Tue, 19 Apr 2005 18:19:34 +0000 Subject: Mass indentation/whitespace cleanup. --- dxr3memcpy.c | 619 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 312 insertions(+), 307 deletions(-) (limited to 'dxr3memcpy.c') diff --git a/dxr3memcpy.c b/dxr3memcpy.c index fe65cc0..19f7fda 100644 --- a/dxr3memcpy.c +++ b/dxr3memcpy.c @@ -60,258 +60,260 @@ __asm__ __volatile__(\ */ // ================================== //! linux kernel __memcpy (from: /include/asm/string.h) -static __inline__ void * __memcpy ( - void * to, - const void * from, - size_t n) +static __inline__ void * __memcpy (void * to, const void * from, size_t n) { -int d0, d1, d2; - - if( n < 4 ) { - small_memcpy(to,from,n); - } - else - __asm__ __volatile__( - "rep ; movsl\n\t" - "testb $2,%b4\n\t" - "je 1f\n\t" - "movsw\n" - "1:\ttestb $1,%b4\n\t" - "je 2f\n\t" - "movsb\n" - "2:" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) - : "memory"); - - return (to); + int d0, d1, d2; + + if (n < 4) + { + small_memcpy(to, from, n); + } + else + __asm__ __volatile__( + "rep ; movsl\n\t" + "testb $2,%b4\n\t" + "je 1f\n\t" + "movsw\n" + "1:\ttestb $1,%b4\n\t" + "je 2f\n\t" + "movsb\n" + "2:" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) + : "memory"); + + return (to); } #define SSE_MMREG_SIZE 16 #define MMX_MMREG_SIZE 8 #define MMX1_MIN_LEN 0x800 /* 2K blocks */ -#define MIN_LEN 0x40 /* 64-byte blocks */ +#define MIN_LEN 0x40 /* 64-byte blocks */ // ================================== /* SSE note: i tried to move 128 bytes a time instead of 64 but it -didn't make any measureable difference. i'm using 64 for the sake of -simplicity. [MF] */ + didn't make any measureable difference. i'm using 64 for the sake of + simplicity. 
[MF] */ static void * sse_memcpy(void * to, const void * from, size_t len) { - void *retval; - size_t i; - retval = to; - - /* PREFETCH has effect even for MOVSB instruction ;) */ - __asm__ __volatile__ ( - " prefetchnta (%0)\n" - " prefetchnta 32(%0)\n" - " prefetchnta 64(%0)\n" - " prefetchnta 96(%0)\n" - " prefetchnta 128(%0)\n" - " prefetchnta 160(%0)\n" - " prefetchnta 192(%0)\n" - " prefetchnta 224(%0)\n" - " prefetchnta 256(%0)\n" - " prefetchnta 288(%0)\n" - : : "r" (from) ); - - if(len >= MIN_LEN) - { - register unsigned long int delta; - /* Align destinition to MMREG_SIZE -boundary */ - delta = ((unsigned long int)to)&(SSE_MMREG_SIZE-1); - if(delta) + void *retval; + size_t i; + retval = to; + + /* PREFETCH has effect even for MOVSB instruction ;) */ + __asm__ __volatile__ ( + " prefetchnta (%0)\n" + " prefetchnta 32(%0)\n" + " prefetchnta 64(%0)\n" + " prefetchnta 96(%0)\n" + " prefetchnta 128(%0)\n" + " prefetchnta 160(%0)\n" + " prefetchnta 192(%0)\n" + " prefetchnta 224(%0)\n" + " prefetchnta 256(%0)\n" + " prefetchnta 288(%0)\n" + : : "r" (from) ); + + if (len >= MIN_LEN) { - delta=SSE_MMREG_SIZE-delta; - len -= delta; - small_memcpy(to, from, delta); + register unsigned long int delta; + /* Align destinition to MMREG_SIZE -boundary */ + delta = ((unsigned long int)to) & (SSE_MMREG_SIZE - 1); + if (delta) + { + delta = SSE_MMREG_SIZE - delta; + len -= delta; + small_memcpy(to, from, delta); + } + i = len >> 6; /* len/64 */ + len&=63; + if (((unsigned long)from) & 15) /* if SRC is misaligned */ + { + for( ; i > 0; i--) + { + __asm__ __volatile__ ( + "prefetchnta 320(%0)\n" + "prefetchnta 352(%0)\n" + "movups (%0), %%xmm0\n" + "movups 16(%0), %%xmm1\n" + "movups 32(%0), %%xmm2\n" + "movups 48(%0), %%xmm3\n" + "movntps %%xmm0, (%1)\n" + "movntps %%xmm1, 16(%1)\n" + "movntps %%xmm2, 32(%1)\n" + "movntps %%xmm3, 48(%1)\n" + : : "r" (from), "r" (to) : "memory"); + from = ((const unsigned char *)from) + 64; + to = ((unsigned char *)to) + 64; + } + } + else + { + /* + Only if SRC is aligned on 16-byte boundary. + It allows to use movaps instead of movups, which required data + to be aligned or a general-protection exception (#GP) is generated. + */ + for( ; i > 0; i--) + { + __asm__ __volatile__ ( + "prefetchnta 320(%0)\n" + "prefetchnta 352(%0)\n" + "movaps (%0), %%xmm0\n" + "movaps 16(%0), %%xmm1\n" + "movaps 32(%0), %%xmm2\n" + "movaps 48(%0), %%xmm3\n" + "movntps %%xmm0, (%1)\n" + "movntps %%xmm1, 16(%1)\n" + "movntps %%xmm2, 32(%1)\n" + "movntps %%xmm3, 48(%1)\n" + : : "r" (from), "r" (to) : "memory"); + from = ((const unsigned char *)from) + 64; + to = ((unsigned char *)to) + 64; + } + } + /* since movntq is weakly-ordered, a "sfence" + * is needed to become ordered again. */ + __asm__ __volatile__ ("sfence": : :"memory"); + /* enables to use FPU */ + __asm__ __volatile__ ("emms": : :"memory"); } - i = len >> 6; /* len/64 */ - len&=63; - if(((unsigned long)from) & 15) - /* if SRC is misaligned */ - for(; i>0; i--) - { - __asm__ __volatile__ ( - "prefetchnta 320(%0)\n" - "prefetchnta 352(%0)\n" - "movups (%0), %%xmm0\n" - "movups 16(%0), %%xmm1\n" - "movups 32(%0), %%xmm2\n" - "movups 48(%0), %%xmm3\n" - "movntps %%xmm0, (%1)\n" - "movntps %%xmm1, 16(%1)\n" - "movntps %%xmm2, 32(%1)\n" - "movntps %%xmm3, 48(%1)\n" - : : "r" (from), "r" (to) : "memory"); - from = ((const unsigned char *)from) + 64; - to = ((unsigned char *)to) + 64; - } - else - /* - Only if SRC is aligned on 16-byte boundary. 
- It allows to use movaps instead of movups, which required data - to be aligned or a general-protection exception (#GP) is generated. - */ - for(; i>0; i--) - { - __asm__ __volatile__ ( - "prefetchnta 320(%0)\n" - "prefetchnta 352(%0)\n" - "movaps (%0), %%xmm0\n" - "movaps 16(%0), %%xmm1\n" - "movaps 32(%0), %%xmm2\n" - "movaps 48(%0), %%xmm3\n" - "movntps %%xmm0, (%1)\n" - "movntps %%xmm1, 16(%1)\n" - "movntps %%xmm2, 32(%1)\n" - "movntps %%xmm3, 48(%1)\n" - : : "r" (from), "r" (to) : "memory"); - from = ((const unsigned char *)from) + 64; - to = ((unsigned char *)to) + 64; - } - /* since movntq is weakly-ordered, a "sfence" - * is needed to become ordered again. */ - __asm__ __volatile__ ("sfence": : :"memory"); - /* enables to use FPU */ - __asm__ __volatile__ ("emms": : :"memory"); - } - /* - * Now do the tail of the block - */ - if(len) __memcpy(to, from, len); - return retval; + /* + * Now do the tail of the block + */ + if(len) __memcpy(to, from, len); + return retval; } // ================================== static void * mmx_memcpy(void * to, const void * from, size_t len) { - void *retval; - size_t i; - retval = to; - - if(len >= MMX1_MIN_LEN) - { - register unsigned long int delta; - /* Align destinition to MMREG_SIZE -boundary */ - delta = ((unsigned long int)to)&(MMX_MMREG_SIZE-1); - if(delta) - { - delta=MMX_MMREG_SIZE-delta; - len -= delta; - small_memcpy(to, from, delta); - } - i = len >> 6; /* len/64 */ - len&=63; - for(; i>0; i--) + void *retval; + size_t i; + retval = to; + + if(len >= MMX1_MIN_LEN) { - __asm__ __volatile__ ( - "movq (%0), %%mm0\n" - "movq 8(%0), %%mm1\n" - "movq 16(%0), %%mm2\n" - "movq 24(%0), %%mm3\n" - "movq 32(%0), %%mm4\n" - "movq 40(%0), %%mm5\n" - "movq 48(%0), %%mm6\n" - "movq 56(%0), %%mm7\n" - "movq %%mm0, (%1)\n" - "movq %%mm1, 8(%1)\n" - "movq %%mm2, 16(%1)\n" - "movq %%mm3, 24(%1)\n" - "movq %%mm4, 32(%1)\n" - "movq %%mm5, 40(%1)\n" - "movq %%mm6, 48(%1)\n" - "movq %%mm7, 56(%1)\n" - : : "r" (from), "r" (to) : "memory"); - from = ((const unsigned char *)from) + 64; - to = ((unsigned char *)to) + 64; + register unsigned long int delta; + /* Align destinition to MMREG_SIZE -boundary */ + delta = ((unsigned long int)to) & (MMX_MMREG_SIZE - 1); + if (delta) + { + delta = MMX_MMREG_SIZE - delta; + len -= delta; + small_memcpy(to, from, delta); + } + i = len >> 6; /* len/64 */ + len&=63; + for( ; i > 0; i--) + { + __asm__ __volatile__ ( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" + "movq 32(%0), %%mm4\n" + "movq 40(%0), %%mm5\n" + "movq 48(%0), %%mm6\n" + "movq 56(%0), %%mm7\n" + "movq %%mm0, (%1)\n" + "movq %%mm1, 8(%1)\n" + "movq %%mm2, 16(%1)\n" + "movq %%mm3, 24(%1)\n" + "movq %%mm4, 32(%1)\n" + "movq %%mm5, 40(%1)\n" + "movq %%mm6, 48(%1)\n" + "movq %%mm7, 56(%1)\n" + : : "r" (from), "r" (to) : "memory"); + from = ((const unsigned char *)from) + 64; + to = ((unsigned char *)to) + 64; + } + __asm__ __volatile__ ("emms": : :"memory"); } - __asm__ __volatile__ ("emms": : :"memory"); - } - /* - * Now do the tail of the block - */ - if(len) __memcpy(to, from, len); - return retval; + /* + * Now do the tail of the block + */ + if(len) __memcpy(to, from, len); + return retval; } // ================================== static void * mmx2_memcpy(void * to, const void * from, size_t len) { - void *retval; - size_t i; - retval = to; - - /* PREFETCH has effect even for MOVSB instruction ;) */ - __asm__ __volatile__ ( - " prefetchnta (%0)\n" - " prefetchnta 32(%0)\n" - " prefetchnta 64(%0)\n" - " 
prefetchnta 96(%0)\n" - " prefetchnta 128(%0)\n" - " prefetchnta 160(%0)\n" - " prefetchnta 192(%0)\n" - " prefetchnta 224(%0)\n" - " prefetchnta 256(%0)\n" - " prefetchnta 288(%0)\n" - : : "r" (from) ); - - if(len >= MIN_LEN) - { - register unsigned long int delta; - /* Align destinition to MMREG_SIZE -boundary */ - delta = ((unsigned long int)to)&(MMX_MMREG_SIZE-1); - if(delta) + void *retval; + size_t i; + retval = to; + + /* PREFETCH has effect even for MOVSB instruction ;) */ + __asm__ __volatile__ ( + " prefetchnta (%0)\n" + " prefetchnta 32(%0)\n" + " prefetchnta 64(%0)\n" + " prefetchnta 96(%0)\n" + " prefetchnta 128(%0)\n" + " prefetchnta 160(%0)\n" + " prefetchnta 192(%0)\n" + " prefetchnta 224(%0)\n" + " prefetchnta 256(%0)\n" + " prefetchnta 288(%0)\n" + : : "r" (from) ); + + if (len >= MIN_LEN) { - delta=MMX_MMREG_SIZE-delta; - len -= delta; - small_memcpy(to, from, delta); - } - i = len >> 6; /* len/64 */ - len&=63; - for(; i>0; i--) - { - __asm__ __volatile__ ( - "prefetchnta 320(%0)\n" - "prefetchnta 352(%0)\n" - "movq (%0), %%mm0\n" - "movq 8(%0), %%mm1\n" - "movq 16(%0), %%mm2\n" - "movq 24(%0), %%mm3\n" - "movq 32(%0), %%mm4\n" - "movq 40(%0), %%mm5\n" - "movq 48(%0), %%mm6\n" - "movq 56(%0), %%mm7\n" - "movntq %%mm0, (%1)\n" - "movntq %%mm1, 8(%1)\n" - "movntq %%mm2, 16(%1)\n" - "movntq %%mm3, 24(%1)\n" - "movntq %%mm4, 32(%1)\n" - "movntq %%mm5, 40(%1)\n" - "movntq %%mm6, 48(%1)\n" - "movntq %%mm7, 56(%1)\n" - : : "r" (from), "r" (to) : "memory"); - from = ((const unsigned char *)from) + 64; - to = ((unsigned char *)to) + 64; + register unsigned long int delta; + /* Align destinition to MMREG_SIZE -boundary */ + delta = ((unsigned long int)to) & (MMX_MMREG_SIZE - 1); + if (delta) + { + delta = MMX_MMREG_SIZE - delta; + len -= delta; + small_memcpy(to, from, delta); + } + i = len >> 6; /* len/64 */ + len&=63; + for( ; i > 0; i--) + { + __asm__ __volatile__ ( + "prefetchnta 320(%0)\n" + "prefetchnta 352(%0)\n" + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" + "movq 32(%0), %%mm4\n" + "movq 40(%0), %%mm5\n" + "movq 48(%0), %%mm6\n" + "movq 56(%0), %%mm7\n" + "movntq %%mm0, (%1)\n" + "movntq %%mm1, 8(%1)\n" + "movntq %%mm2, 16(%1)\n" + "movntq %%mm3, 24(%1)\n" + "movntq %%mm4, 32(%1)\n" + "movntq %%mm5, 40(%1)\n" + "movntq %%mm6, 48(%1)\n" + "movntq %%mm7, 56(%1)\n" + : : "r" (from), "r" (to) : "memory"); + from = ((const unsigned char *)from) + 64; + to = ((unsigned char *)to) + 64; + } + /* since movntq is weakly-ordered, a "sfence" + * is needed to become ordered again. */ + __asm__ __volatile__ ("sfence": : :"memory"); + __asm__ __volatile__ ("emms": : :"memory"); } - /* since movntq is weakly-ordered, a "sfence" - * is needed to become ordered again. */ - __asm__ __volatile__ ("sfence": : :"memory"); - __asm__ __volatile__ ("emms": : :"memory"); - } - /* - * Now do the tail of the block - */ - if(len) __memcpy(to, from, len); - return retval; + /* + * Now do the tail of the block + */ + if(len) __memcpy(to, from, len); + return retval; } // ================================== -static void *linux_kernel_memcpy(void *to, const void *from, size_t len) { - return __memcpy(to,from,len); +static void *linux_kernel_memcpy(void *to, const void *from, size_t len) +{ + return __memcpy(to, from, len); } #endif /* __i386__ || __x86_64__ */ @@ -319,105 +321,108 @@ static void *linux_kernel_memcpy(void *to, const void *from, size_t len) { // ================================== //! constr. 
cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags) -{ - // - // add all aviable memcpy routines - // +{ + // + // add all available memcpy routines + // - memcpy_routine routine; + memcpy_routine routine; - // glibc memcpy - routine.name = "glibc memcpy()"; - routine.function = memcpy; - routine.time = 0; - routine.cpu_require = 0; - m_methods.push_back(routine); + // glibc memcpy + routine.name = "glibc memcpy()"; + routine.function = memcpy; + routine.time = 0; + routine.cpu_require = 0; + m_methods.push_back(routine); #if defined(__i386__) || defined(__x86_64__) - // linux_kernel_memcpy - routine.name = "linux_kernel_memcpy()"; - routine.function = linux_kernel_memcpy; - routine.cpu_require = 0; - m_methods.push_back(routine); + // linux_kernel_memcpy + routine.name = "linux_kernel_memcpy()"; + routine.function = linux_kernel_memcpy; + routine.cpu_require = 0; + m_methods.push_back(routine); - // MMX optimized memcpy() - routine.name = "MMX optimized memcpy()"; - routine.function = mmx_memcpy; - routine.cpu_require = CC_MMX; - m_methods.push_back(routine); + // MMX optimized memcpy() + routine.name = "MMX optimized memcpy()"; + routine.function = mmx_memcpy; + routine.cpu_require = CC_MMX; + m_methods.push_back(routine); - // MMXEXT optimized memcpy() - routine.name = "MMXEXT optimized memcpy()"; - routine.function = mmx2_memcpy; - routine.cpu_require = CC_MMXEXT; - m_methods.push_back(routine); + // MMXEXT optimized memcpy() + routine.name = "MMXEXT optimized memcpy()"; + routine.function = mmx2_memcpy; + routine.cpu_require = CC_MMXEXT; + m_methods.push_back(routine); #ifndef __FreeBSD__ - // SSE optimized memcpy() - routine.name = "SSE optimized memcpy()"; - routine.function = sse_memcpy; - routine.cpu_require = CC_MMXEXT|CC_SSE; - m_methods.push_back(routine); + // SSE optimized memcpy() + routine.name = "SSE optimized memcpy()"; + routine.function = sse_memcpy; + routine.cpu_require = CC_MMXEXT|CC_SSE; + m_methods.push_back(routine); #endif /* not __FreeBSD__ */ #endif /* __i386__ || __x86_64__ */ - // - // run benchmarking - // + // + // run benchmarking + // + + unsigned long long t = 0; + void *buf1, *buf2; + int j, best = -1; + + if ((buf1 = malloc(BUFSIZE)) == NULL) + return; - unsigned long long t = 0; - void *buf1, *buf2; - int j, best = -1; + if ((buf2 = malloc(BUFSIZE)) == NULL) + { + free(buf1); + return; + } - if ((buf1 = malloc(BUFSIZE)) == NULL) - return; - - if ((buf2 = malloc(BUFSIZE)) == NULL) + cLog::Instance() << + "\nBenchmarking memcpy() methods (smaller is better):\n"; + // make sure buffers are present on physical memory + memcpy(buf1, buf2, BUFSIZE); + + for (size_t i = 0; i < m_methods.size(); i++) + { + if ((config_flags & m_methods[i].cpu_require) != m_methods[i].cpu_require) + { + continue; + } + + // count 100 runs of the memcpy function + t = Rdtsc(config_flags); + for (j = 0; j < 50; j++) { - free(buf1); - return; + m_methods[i].function(buf2, buf1, BUFSIZE); + m_methods[i].function(buf1, buf2, BUFSIZE); } + t = Rdtsc(config_flags) - t; + + m_methods[i].time = t; - cLog::Instance() << "\nBenchmarking memcpy() methods (smaller is better):\n"; - // make sure buffers are present on physical memory - memcpy(buf1,buf2,BUFSIZE); + cLog::Instance() << m_methods[i].name.c_str() << ": " + << (unsigned long long)t << "\n"; - for (size_t i = 0; i < m_methods.size(); i++) + if (best == -1 || t < m_methods[best].time) { - if ((config_flags & m_methods[i].cpu_require) != m_methods[i].cpu_require) - { - continue; - } - - // count 100 runs of the memcpy 
function - t = Rdtsc(config_flags); - for (j = 0; j < 50; j++) - { - m_methods[i].function(buf2,buf1,BUFSIZE); - m_methods[i].function(buf1,buf2,BUFSIZE); - } - t = Rdtsc(config_flags) - t; - - m_methods[i].time = t; - - cLog::Instance() << m_methods[i].name.c_str() << ": " << (unsigned long long)t << "\n"; - - if (best == -1 || t < m_methods[best].time) - { - best = i; - } + best = i; } - cLog::Instance() << "\nBest one: " << m_methods[best].name.c_str() << "\n\n"; + } + cLog::Instance() << "\nBest one: " + << m_methods[best].name.c_str() << "\n\n"; - dxr3_memcpy = m_methods[best].function; + dxr3_memcpy = m_methods[best].function; - // clear unused memory - free(buf1); - free(buf2); + // clear unused memory + free(buf1); + free(buf2); } // ================================== @@ -425,20 +430,20 @@ cDxr3MemcpyBench::cDxr3MemcpyBench(uint32_t config_flags) unsigned long long int cDxr3MemcpyBench::Rdtsc(uint32_t config_flags) { #if defined(__i386__) || defined(__x86_64__) - // we need rdtsc support - if (config_flags && CC_MMX) - { - unsigned long long int x; - __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); - return x; - } - else - { - return times(NULL); - } + // we need rdtsc support + if (config_flags && CC_MMX) + { + unsigned long long int x; + __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); + return x; + } + else + { + return times(NULL); + } #else - struct tms tp; - return times(&tp); + struct tms tp; + return times(&tp); #endif /* __i386__ || __x86_64__ */ } -- cgit v1.2.3
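
The patch above is whitespace-only, but the code it reindents is the interesting part: sse_memcpy() prefetches ahead of the copy, streams 64-byte blocks through the XMM registers with non-temporal stores (movntps), fences with sfence, and hands the tail to __memcpy(). Below is a minimal sketch of that same idea written with SSE intrinsics instead of inline assembly. It is not part of the patch; the name stream_copy_sketch is invented, and it assumes the destination is already 16-byte aligned (the real sse_memcpy() first aligns it with small_memcpy(), which is omitted here).

#include <stddef.h>
#include <string.h>
#include <xmmintrin.h>  /* SSE: _mm_loadu_ps, _mm_stream_ps, _mm_sfence */

/* Illustrative only -- copies len bytes in 64-byte blocks using
 * non-temporal stores, assuming `to` is already 16-byte aligned. */
static void *stream_copy_sketch(void *to, const void *from, size_t len)
{
    unsigned char *d = (unsigned char *)to;
    const unsigned char *s = (const unsigned char *)from;
    size_t blocks = len >> 6;              /* len / 64 */

    for (; blocks > 0; blocks--) {
        /* four unaligned 16-byte loads (movups) ... */
        __m128 x0 = _mm_loadu_ps((const float *)(s +  0));
        __m128 x1 = _mm_loadu_ps((const float *)(s + 16));
        __m128 x2 = _mm_loadu_ps((const float *)(s + 32));
        __m128 x3 = _mm_loadu_ps((const float *)(s + 48));
        /* ... and four non-temporal stores (movntps), which bypass the
         * cache so a large copy does not evict the working set */
        _mm_stream_ps((float *)(d +  0), x0);
        _mm_stream_ps((float *)(d + 16), x1);
        _mm_stream_ps((float *)(d + 32), x2);
        _mm_stream_ps((float *)(d + 48), x3);
        s += 64;
        d += 64;
    }
    _mm_sfence();              /* order the weakly-ordered streaming stores */
    memcpy(d, s, len & 63);    /* tail of the block, as __memcpy() does above */
    return to;
}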
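
Likewise, the cDxr3MemcpyBench constructor in the second hunk simply times every routine the CPU supports over the same pair of buffers and keeps the cheapest one as dxr3_memcpy. A reduced plain-C sketch of that selection loop might look like the following; pick_fastest, copy_fn and BENCH_BUFSIZE are invented names, __rdtsc() stands in for the raw ".byte 0x0f, 0x31" sequence in Rdtsc(), and the times() fallback for CPUs without a usable TSC is left out.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>  /* __rdtsc() */

#define BENCH_BUFSIZE (1024 * 1024)   /* assumption; the patch uses BUFSIZE */

typedef void *(*copy_fn)(void *, const void *, size_t);

/* Times each candidate over the same buffers and returns the cheapest one,
 * the way the constructor picks dxr3_memcpy.  Expects count >= 1. */
static copy_fn pick_fastest(const copy_fn *candidates, size_t count)
{
    copy_fn best = candidates[0];
    uint64_t best_cycles = UINT64_MAX;
    void *buf1 = malloc(BENCH_BUFSIZE);
    void *buf2 = malloc(BENCH_BUFSIZE);

    if (buf1 == NULL || buf2 == NULL) {
        free(buf1);
        free(buf2);
        return best;
    }
    memcpy(buf1, buf2, BENCH_BUFSIZE);     /* fault the pages in, as above */

    for (size_t i = 0; i < count; i++) {
        uint64_t t = __rdtsc();
        for (int j = 0; j < 50; j++) {     /* 100 copies per candidate */
            candidates[i](buf2, buf1, BENCH_BUFSIZE);
            candidates[i](buf1, buf2, BENCH_BUFSIZE);
        }
        t = __rdtsc() - t;
        if (t < best_cycles) {
            best_cycles = t;
            best = candidates[i];
        }
    }
    free(buf1);
    free(buf2);
    return best;
}

/* usage, e.g.:
 *   copy_fn table[] = { memcpy, stream_copy_sketch };
 *   copy_fn chosen  = pick_fastest(table, 2);
 */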