diff options
author | Petri Hintukainen <phintuka@users.sourceforge.net> | 2012-05-15 22:16:22 +0300 |
---|---|---|
committer | Petri Hintukainen <phintuka@users.sourceforge.net> | 2012-05-15 22:16:22 +0300 |
commit | 2345dfe3dec4f081da14ebbcc68184ff141dcc46 (patch) | |
tree | e87a8e460d3b69c0d178e6cd7e746107bf694a89 | |
parent | 03f050c78569507d3dc80e812d874e26fa2852dc (diff) | |
download | xine-lib-2345dfe3dec4f081da14ebbcc68184ff141dcc46.tar.gz xine-lib-2345dfe3dec4f081da14ebbcc68184ff141dcc46.tar.bz2 |
speedy.c: added diff_factor_packed422_scanline_sse2()
Tested with Atom N550 + SD video:
- ~25% speedup with unaligned data
- ~50% speedup with aligned data
(scanlines were properly aligned with all my test clips)
-rw-r--r-- | src/post/deinterlace/speedy.c | 90 |
1 file changed, 90 insertions, 0 deletions
diff --git a/src/post/deinterlace/speedy.c b/src/post/deinterlace/speedy.c
index 6af2c3b88..461e4aefd 100644
--- a/src/post/deinterlace/speedy.c
+++ b/src/post/deinterlace/speedy.c
@@ -344,6 +344,89 @@ static unsigned int diff_factor_packed422_scanline_mmx( uint8_t *cur, uint8_t *o
 }
 #endif
 
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+/* Bit masks selecting the luma (Y) / chroma (C) bytes of packed 4:2:2 data.
+   NOTE(review): dqwCMask is not referenced by the code added in this patch --
+   presumably kept for symmetry with a future chroma variant; confirm usage. */
+static const sse_t dqwYMask = { uq: { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }};
+static const sse_t dqwCMask = { uq: { 0xff00ff00ff00ff00ULL, 0xff00ff00ff00ff00ULL }};
+
+/* Sum of squared luma differences between two packed 4:2:2 scanlines,
+   each per-lane partial sum right-shifted by BitShift.  SSE2 variant for
+   scanlines that are both 16-byte aligned (movdqa loads). */
+static unsigned int diff_factor_packed422_scanline_sse2_aligned( uint8_t *cur, uint8_t *old, int width )
+{
+    register unsigned int temp;
+
+    width /= 8;                     /* 8 packed pixels == 16 bytes per pass */
+
+    movdqa_m2r( dqwYMask, xmm1 );
+    movd_m2r( BitShift, xmm7 );
+    pxor_r2r( xmm0, xmm0 );
+
+    while( width-- ) {
+        movdqa_m2r( *cur, xmm4 );
+        movdqa_m2r( *old, xmm5 );
+
+        pand_r2r( xmm1, xmm4 );     /* keep only the Y samples */
+        pand_r2r( xmm1, xmm5 );
+
+        psubw_r2r( xmm5, xmm4 );    /* xmm4 = Y1 - Y2 */
+        pmaddwd_r2r( xmm4, xmm4 );  /* xmm4 = (Y1 - Y2)^2 */
+        psrld_r2r( xmm7, xmm4 );    /* divide xmm4 by 2^BitShift */
+        paddd_r2r( xmm4, xmm0 );    /* keep total in xmm0 */
+
+        cur += 16;
+        old += 16;
+    }
+
+    /* horizontal add of the four 32-bit partial sums held in xmm0 */
+    pshufd_r2r(xmm0, xmm1, 0x0e);
+    paddd_r2r(xmm1, xmm0);
+    pshufd_r2r(xmm0, xmm1, 0x01);
+    paddd_r2r(xmm1, xmm0);
+
+    movd_r2a(xmm0, temp);
+    return temp;
+}
+#endif
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+/* As above, but tolerant of unaligned input: dispatches to the movdqa
+   version when both scanlines are 16-byte aligned, otherwise falls
+   through to movdqu (unaligned) loads. */
+static unsigned int diff_factor_packed422_scanline_sse2( uint8_t *cur, uint8_t *old, int width )
+{
+    /* NOTE(review): casting a pointer to unsigned int truncates on 64-bit
+       targets; the low 4 bits survive, so this alignment test still works,
+       but uintptr_t would be the cleaner type -- confirm vs. project style. */
+    if (0 == (((unsigned int)cur|(unsigned int)old) & 15)) {
+        return diff_factor_packed422_scanline_sse2_aligned(cur, old, width);
+    }
+
+    register unsigned int temp;
+
+    width /= 8;                     /* 8 packed pixels == 16 bytes per pass */
+
+    movdqa_m2r( dqwYMask, xmm1 );
+    movd_m2r( BitShift, xmm7 );
+    pxor_r2r( xmm0, xmm0 );
+
+    while( width-- ) {
+        movdqu_m2r( *cur, xmm4 );   /* unaligned loads */
+        movdqu_m2r( *old, xmm5 );
+
+        pand_r2r( xmm1, xmm4 );     /* keep only the Y samples */
+        pand_r2r( xmm1, xmm5 );
+
+        psubw_r2r( xmm5, xmm4 );    /* xmm4 = Y1 - Y2 */
+        pmaddwd_r2r( xmm4, xmm4 );  /* xmm4 = (Y1 - Y2)^2 */
+        psrld_r2r( xmm7, xmm4 );    /* divide xmm4 by 2^BitShift */
+        paddd_r2r( xmm4, xmm0 );    /* keep total in xmm0 */
+
+        cur += 16;
+        old += 16;
+    }
+
+    /* horizontal add of the four 32-bit partial sums held in xmm0 */
+    pshufd_r2r(xmm0, xmm1, 0x0e);
+    paddd_r2r(xmm1, xmm0);
+    pshufd_r2r(xmm0, xmm1, 0x01);
+    paddd_r2r(xmm1, xmm0);
+
+    movd_r2a(xmm0, temp);
+
+    return temp;
+}
+#endif
+
 #define ABS(a) (((a) < 0)?-(a):(a))
 
 #if defined(ARCH_X86) || defined(ARCH_X86_64)
@@ -2459,6 +2542,13 @@ void setup_speedy_calls( uint32_t accel, int verbose )
       printf( "speedycode: No MMX or MMXEXT support detected, using C fallbacks.\n" );
     }
   }
+
+  if( speedy_accel & MM_ACCEL_X86_SSE2 ) {
+    if( verbose ) {
+      printf( "speedycode: Using SSE2 optimized functions.\n" );
+    }
+    diff_factor_packed422_scanline = diff_factor_packed422_scanline_sse2;
+  }
 #endif
 }
|