summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMiguel Freitas <miguelfreitas@users.sourceforge.net>2003-12-08 17:06:53 +0000
committerMiguel Freitas <miguelfreitas@users.sourceforge.net>2003-12-08 17:06:53 +0000
commit0e18f82b79ac54f306aa1ee31170bd3099b5c17a (patch)
treeb291be19e1c8786594cd3295feb1b4262d755685 /src
parent335d7eaf32502909f106b6e85e5dcb47ed60fd26 (diff)
downloadxine-lib-0e18f82b79ac54f306aa1ee31170bd3099b5c17a.tar.gz
xine-lib-0e18f82b79ac54f306aa1ee31170bd3099b5c17a.tar.bz2
sfence is not a valid MMX instruction.
provide a new MMXEXT version. about 15% faster here. CVS patchset: 5875 CVS date: 2003/12/08 17:06:53
Diffstat (limited to 'src')
-rw-r--r--src/post/deinterlace/plugins/linearblend.c155
1 files changed, 152 insertions, 3 deletions
diff --git a/src/post/deinterlace/plugins/linearblend.c b/src/post/deinterlace/plugins/linearblend.c
index ef7ab52bc..977efe2b3 100644
--- a/src/post/deinterlace/plugins/linearblend.c
+++ b/src/post/deinterlace/plugins/linearblend.c
@@ -86,6 +86,7 @@ static void deinterlace_scanline_linear_blend( uint8_t *output,
movd_r2m( mm2, *output );
movd_r2m( mm5, *(output+4) );
+
output += 8;
t0 += 8;
b0 += 8;
@@ -94,7 +95,6 @@ static void deinterlace_scanline_linear_blend( uint8_t *output,
while( width-- ) {
*output++ = (*t0++ + *b0++ + (*m1++ << 1)) >> 2;
}
- sfence();
emms();
#else
width *= 2;
@@ -151,6 +151,7 @@ static void deinterlace_scanline_linear_blend2( uint8_t *output,
movd_r2m( mm2, *output );
movd_r2m( mm5, *(output+4) );
+
output += 8;
t1 += 8;
b1 += 8;
@@ -159,7 +160,6 @@ static void deinterlace_scanline_linear_blend2( uint8_t *output,
while( width-- ) {
*output++ = (*t1++ + *b1++ + (*m0++ << 1)) >> 2;
}
- sfence();
emms();
#else
width *= 2;
@@ -169,6 +169,150 @@ static void deinterlace_scanline_linear_blend2( uint8_t *output,
#endif
}
+#ifdef ARCH_X86
+
+/* MMXEXT version is about 15% faster with Athlon XP [MF] */
+
+static void deinterlace_scanline_linear_blend_mmxext( uint8_t *output,
+ deinterlace_scanline_data_t *data,
+ int width )
+{
+ uint8_t *t0 = data->t0;
+ uint8_t *b0 = data->b0;
+ uint8_t *m1 = data->m1;
+ int i;
+ static mmx_t high_mask = {ub:{0xff,0xff,0xff,0xff,0,0,0,0}};
+
+ // Get width in bytes.
+ width *= 2;
+ i = width / 8;
+ width -= i * 8;
+
+ movd_m2r( high_mask, mm6 );
+ pxor_r2r( mm7, mm7 );
+ while( i-- ) {
+ movd_m2r( *t0, mm0 );
+ movd_m2r( *b0, mm1 );
+ movd_m2r( *m1, mm2 );
+
+ movd_m2r( *(t0+4), mm3 );
+ movd_m2r( *(b0+4), mm4 );
+ movd_m2r( *(m1+4), mm5 );
+
+ punpcklbw_r2r( mm7, mm0 );
+ punpcklbw_r2r( mm7, mm1 );
+ punpcklbw_r2r( mm7, mm2 );
+
+ punpcklbw_r2r( mm7, mm3 );
+ punpcklbw_r2r( mm7, mm4 );
+ punpcklbw_r2r( mm7, mm5 );
+
+ psllw_i2r( 1, mm2 );
+ psllw_i2r( 1, mm5 );
+ paddw_r2r( mm0, mm2 );
+ paddw_r2r( mm3, mm5 );
+ paddw_r2r( mm1, mm2 );
+ paddw_r2r( mm4, mm5 );
+ psrlw_i2r( 2, mm2 );
+ psrlw_i2r( 2, mm5 );
+ packuswb_r2r( mm2, mm2 );
+ packuswb_r2r( mm5, mm5 );
+
+ psllq_i2r( 32, mm5 );
+ pand_r2r( mm6, mm2 );
+ por_r2r( mm2, mm5 );
+ movntq_r2m( mm5, *output );
+
+ output += 8;
+ t0 += 8;
+ b0 += 8;
+ m1 += 8;
+ }
+ while( width-- ) {
+ *output++ = (*t0++ + *b0++ + (*m1++ << 1)) >> 2;
+ }
+ sfence();
+ emms();
+}
+
+static void deinterlace_scanline_linear_blend2_mmxext( uint8_t *output,
+ deinterlace_scanline_data_t *data,
+ int width )
+{
+ uint8_t *m0 = data->m0;
+ uint8_t *t1 = data->t1;
+ uint8_t *b1 = data->b1;
+
+ int i;
+
+ // Get width in bytes.
+ width *= 2;
+ i = width / 8;
+ width -= i * 8;
+
+ pxor_r2r( mm7, mm7 );
+ while( i-- ) {
+ movd_m2r( *t1, mm0 );
+ movd_m2r( *b1, mm1 );
+ movd_m2r( *m0, mm2 );
+
+ movd_m2r( *(t1+4), mm3 );
+ movd_m2r( *(b1+4), mm4 );
+ movd_m2r( *(m0+4), mm5 );
+
+ punpcklbw_r2r( mm7, mm0 );
+ punpcklbw_r2r( mm7, mm1 );
+ punpcklbw_r2r( mm7, mm2 );
+
+ punpcklbw_r2r( mm7, mm3 );
+ punpcklbw_r2r( mm7, mm4 );
+ punpcklbw_r2r( mm7, mm5 );
+
+ psllw_i2r( 1, mm2 );
+ psllw_i2r( 1, mm5 );
+ paddw_r2r( mm0, mm2 );
+ paddw_r2r( mm3, mm5 );
+ paddw_r2r( mm1, mm2 );
+ paddw_r2r( mm4, mm5 );
+ psrlw_i2r( 2, mm2 );
+ psrlw_i2r( 2, mm5 );
+ packuswb_r2r( mm2, mm2 );
+ packuswb_r2r( mm5, mm5 );
+
+ psllq_i2r( 32, mm5 );
+ pand_r2r( mm6, mm2 );
+ por_r2r( mm2, mm5 );
+ movntq_r2m( mm5, *output );
+
+ output += 8;
+ t1 += 8;
+ b1 += 8;
+ m0 += 8;
+ }
+ while( width-- ) {
+ *output++ = (*t1++ + *b1++ + (*m0++ << 1)) >> 2;
+ }
+ sfence();
+ emms();
+}
+
+static deinterlace_method_t linearblendmethod_mmxext =
+{
+ DEINTERLACE_PLUGIN_API_VERSION,
+ "mplayer: Linear Blend",
+ "LinearBlend",
+ 2,
+ MM_ACCEL_X86_MMXEXT,
+ 0,
+ 0,
+ 0,
+ 1,
+ deinterlace_scanline_linear_blend_mmxext,
+ deinterlace_scanline_linear_blend2_mmxext,
+ 0
+};
+
+#endif
static deinterlace_method_t linearblendmethod =
{
@@ -196,5 +340,10 @@ void deinterlace_plugin_init( void )
void linearblend_plugin_init( void )
#endif
{
- register_deinterlace_method( &linearblendmethod );
+#ifdef ARCH_X86
+ if( xine_mm_accel() & MM_ACCEL_X86_MMXEXT )
+ register_deinterlace_method( &linearblendmethod_mmxext );
+ else
+#endif
+ register_deinterlace_method( &linearblendmethod );
}