4 files changed, 426 insertions, 204 deletions
diff --git a/src/post/deinterlace/deinterlace.c b/src/post/deinterlace/deinterlace.c
index f797f28cf..60f5b50ab 100644
--- a/src/post/deinterlace/deinterlace.c
+++ b/src/post/deinterlace/deinterlace.c
@@ -30,24 +30,19 @@ struct methodlist_item_s
 };
 
 static methodlist_item_t *methodlist = 0;
-static initialized = 0;
 
 void register_deinterlace_method( deinterlace_method_t *method )
 {
-    methodlist_item_t **dest;
-
-    if( initialized )
-        return;
+    methodlist_item_t **dest = &methodlist;
+    methodlist_item_t *cur = methodlist;
 
-    if( !methodlist ) {
-        dest = &methodlist;
-    } else {
-        methodlist_item_t *cur = methodlist;
-        while( cur->next ) cur = cur->next;
+    while( cur ) {
+        if( cur->method == method ) return;
         dest = &(cur->next);
+        cur = cur->next;
     }
 
-    *dest = (methodlist_item_t *) malloc( sizeof( methodlist_item_t ) );
+    *dest = malloc( sizeof( methodlist_item_t ) );
     if( *dest ) {
         (*dest)->method = method;
         (*dest)->next = 0;
@@ -80,14 +75,27 @@ deinterlace_method_t *get_deinterlace_method( int i )
     return cur->method;
 }
 
+void register_deinterlace_plugin( const char *filename )
+{
+    /* Load the shared object `filename` and run its deinterlace_plugin_init(). */
+    deinterlace_plugin_init_t plugin_init;
+    void *handle = dlopen( filename, RTLD_NOW );
+
+    if( !handle ) {
+        fprintf( stderr, "deinterlace: Can't load plugin '%s': %s\n",
+                 filename, dlerror() );
+        return;
+    }
+    plugin_init = (deinterlace_plugin_init_t) dlsym( handle, "deinterlace_plugin_init" );
+    if( plugin_init ) plugin_init(); /* handle stays open: presumably plugin code is used later */
+    else dlclose( handle );          /* no init symbol: release the handle instead of leaking it */
+}
+
 void filter_deinterlace_methods( int accel, int fields_available )
 {
     methodlist_item_t *prev = 0;
     methodlist_item_t *cur = methodlist;
 
-    if( initialized )
-        return;
-
     while( cur ) {
         methodlist_item_t *next = cur->next;
         int drop = 0;
@@ -120,6 +128,5 @@ void filter_deinterlace_methods( int accel, int fields_available )
         }
         cur = next;
     }
-    initialized = 1;
 }
 
diff --git a/src/post/deinterlace/speedy.c b/src/post/deinterlace/speedy.c
index 2a1e7f673..3a94cec55 100644
--- a/src/post/deinterlace/speedy.c
+++ b/src/post/deinterlace/speedy.c
@@ -17,18 +17,6 @@
  * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  */
 
-/*
- * Uses code from:
- *
- *  linux/arch/i386/kernel/setup.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *
- * Found in linux 2.4.20.
- *
- * Also helped from code in 'cpuinfo.c' found in mplayer.
- */
-
 #include <stdio.h>
 #include <string.h>
 #include <sys/time.h>
@@ -96,10 +84,6 @@ void (*subpix_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *top,
                                                  uint8_t *bot, int subpixpos, int width );
 
 
-#define SPEEDY_START()
-
-#define SPEEDY_END()
-
 /**
  * result = (1 - alpha)B + alpha*F
  *        =  B - alpha*B + alpha*F
@@ -124,18 +108,16 @@ static inline __attribute__ ((always_inline,const)) uint8_t clip255( int x )
     }
 }
 
-#ifdef ARCH_X86
+static unsigned long CombJaggieThreshold = 73;
 
 static unsigned int comb_factor_packed422_scanline_mmx( uint8_t *top, uint8_t *mid,
                                                  uint8_t *bot, int width )
 {
+#ifdef ARCH_X86
     const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL };
     const mmx_t qwOnes = { 0x0001000100010001ULL };
     mmx_t qwThreshold;
     unsigned int temp1, temp2;
-    unsigned long CombJaggieThreshold = 73;
-
-    SPEEDY_START();
 
     width /= 4;
 
@@ -206,12 +188,11 @@ static unsigned int comb_factor_packed422_scanline_mmx( uint8_t *top, uint8_t *m
 
     emms();
 
-    SPEEDY_END();
-
     return temp1;
-}
-
+#else
+    return 0;
 #endif
+}
 
 static unsigned long BitShift = 6;
 
@@ -219,8 +200,6 @@ static unsigned int diff_factor_packed422_scanline_c( uint8_t *cur, uint8_t *old
 {
     unsigned int ret = 0;
 
-    SPEEDY_START();
-
     width /= 4;
 
     while( width-- ) {
@@ -233,7 +212,6 @@ static unsigned int diff_factor_packed422_scanline_c( uint8_t *cur, uint8_t *old
         cur += 8;
         old += 8;
     }
-    SPEEDY_END();
 
     return ret;
 }
@@ -242,8 +220,6 @@ static unsigned int diff_factor_packed422_scanline_test_c( uint8_t *cur, uint8_t
 {
     unsigned int ret = 0;
 
-    SPEEDY_START();
-
     width /= 16;
 
     while( width-- ) {
@@ -256,20 +232,17 @@ static unsigned int diff_factor_packed422_scanline_test_c( uint8_t *cur, uint8_t
         cur += (8*4);
         old += (8*4);
     }
-    SPEEDY_END();
 
     return ret;
 }
 
-#ifdef ARCH_X86
 
 static unsigned int diff_factor_packed422_scanline_mmx( uint8_t *cur, uint8_t *old, int width )
 {
+#ifdef ARCH_X86
     const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL };
     unsigned int temp1, temp2;
 
-    SPEEDY_START();
-
     width /= 4;
 
     movq_m2r( qwYMask, mm1 );
@@ -299,9 +272,10 @@ static unsigned int diff_factor_packed422_scanline_mmx( uint8_t *cur, uint8_t *o
 
     emms();
 
-    SPEEDY_END();
-
     return temp1;
+#else
+    return 0;
+#endif
 }
 
 #define ABS(a) (((a) < 0)?-(a):(a))
@@ -309,14 +283,13 @@ static unsigned int diff_factor_packed422_scanline_mmx( uint8_t *cur, uint8_t *o
 static void diff_packed422_block8x8_mmx( pulldown_metrics_t *m, uint8_t *old,
                                   uint8_t *new, int os, int ns )
 {
+#ifdef ARCH_X86
     const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
     short out[ 24 ]; /* Output buffer for the partial metrics from the mmx code. */
     uint8_t *outdata = (uint8_t *) out;
     uint8_t *oldp, *newp;
     int i;
 
-    SPEEDY_START();
-
     pxor_r2r( mm4, mm4 );  // 4 even difference sums.
     pxor_r2r( mm5, mm5 );  // 4 odd difference sums.
     pxor_r2r( mm7, mm7 );  // zeros
@@ -449,11 +422,8 @@ static void diff_packed422_block8x8_mmx( pulldown_metrics_t *m, uint8_t *old,
     }
 
     emms();
-
-    SPEEDY_END();
-}
-
 #endif
+}
 
 static void diff_packed422_block8x8_c( pulldown_metrics_t *m, uint8_t *old,
                                 uint8_t *new, int os, int ns )
@@ -461,7 +431,6 @@ static void diff_packed422_block8x8_c( pulldown_metrics_t *m, uint8_t *old,
     int x, y, e=0, o=0, s=0, p=0, t=0;
     uint8_t *oldp, *newp;
 
-    SPEEDY_START();
     m->s = m->p = m->t = 0;
     for (x = 8; x; x--) {
         oldp = old; old += 2;
@@ -483,12 +452,10 @@ static void diff_packed422_block8x8_c( pulldown_metrics_t *m, uint8_t *old,
     m->e = e;
     m->o = o;
     m->d = e+o;
-    SPEEDY_END();
 }
 
 static void packed444_to_packed422_scanline_c( uint8_t *output, uint8_t *input, int width )
 {
-    SPEEDY_START();
     width /= 2;
     while( width-- ) {
         output[ 0 ] = input[ 0 ];
@@ -498,12 +465,10 @@ static void packed444_to_packed422_scanline_c( uint8_t *output, uint8_t *input,
         output += 4;
         input += 6;
     }
-    SPEEDY_END();
 }
 
 static void packed422_to_packed444_scanline_c( uint8_t *output, uint8_t *input, int width )
 {
-    SPEEDY_START();
     width /= 2;
     while( width-- ) {
         output[ 0 ] = input[ 0 ];
@@ -515,7 +480,6 @@ static void packed422_to_packed444_scanline_c( uint8_t *output, uint8_t *input,
         output += 6;
         input += 4;
     }
-    SPEEDY_END();
 }
 
 /**
@@ -527,7 +491,6 @@ void packed422_to_packed444_rec601_scanline( uint8_t *dest, uint8_t *src, int wi
 {
     int i;
 
-    SPEEDY_START();
     /* Process two input pixels at a time.  Input is [Y'][Cb][Y'][Cr]. */
     for( i = 0; i < width / 2; i++ ) {
         dest[ (i*6) + 0 ] = src[ (i*4) + 0 ];
@@ -556,18 +519,14 @@ void packed422_to_packed444_rec601_scanline( uint8_t *dest, uint8_t *src, int wi
             dest[ (i*6) + 5 ] = src[ (i*4) + 3 ];
         }
     }
-    SPEEDY_END();
 }
 
-#ifdef ARCH_X86
-
 static void kill_chroma_packed422_inplace_scanline_mmx( uint8_t *data, int width )
 {
+#ifdef ARCH_X86
     const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
     const mmx_t nullchroma = { 0x8000800080008000ULL };
 
-    SPEEDY_START();
-
     movq_m2r( ymask, mm7 );
     movq_m2r( nullchroma, mm6 );
     for(; width > 4; width -= 4 ) {
@@ -583,19 +542,15 @@ static void kill_chroma_packed422_inplace_scanline_mmx( uint8_t *data, int width
         data[ 1 ] = 128;
         data += 2;
     }
-    SPEEDY_END();
-}
-
 #endif
+}
 
 static void kill_chroma_packed422_inplace_scanline_c( uint8_t *data, int width )
 {
-    SPEEDY_START();
     while( width-- ) {
         data[ 1 ] = 128;
         data += 2;
     }
-    SPEEDY_END();
 }
 
 /*
@@ -607,7 +562,6 @@ static void testing_packed422_inplace_scanline_c( uint8_t *data, int width, int
     volatile static int topbottom = 0;
     static uint8_t scanbuffer[2048];
 
-    SPEEDY_START();
     if( scanline <= 1 ) {
         topbottom = scanline;
         memcpy(scanbuffer, data, width*2);
@@ -620,7 +574,6 @@ static void testing_packed422_inplace_scanline_c( uint8_t *data, int width, int
     } else {
         memcpy(scanbuffer, data, width*2);
     }
-    SPEEDY_END();
 }
 */
 
@@ -629,7 +582,6 @@ static void mirror_packed422_inplace_scanline_c( uint8_t *data, int width )
     int x, tmp1, tmp2;
     int width2 = width*2;
 
-    SPEEDY_START();
     for( x = 0; x < width; x += 2 ) {
         tmp1 = data[ x   ];
         tmp2 = data[ x+1 ];
@@ -638,19 +590,16 @@ static void mirror_packed422_inplace_scanline_c( uint8_t *data, int width )
         data[ width2 - x     ] = tmp1;
         data[ width2 - x + 1 ] = tmp2;
     }
-    SPEEDY_END();
 }
 
 static void halfmirror_packed422_inplace_scanline_c( uint8_t *data, int width )
 {
     int x;
 
-    SPEEDY_START();
     for( x = 0; x < width; x += 2 ) {
         data[ width + x     ] = data[ width - x     ];
         data[ width + x + 1 ] = data[ width - x + 1 ];
     }
-    SPEEDY_END();
 }
 
 static void filter_luma_121_packed422_inplace_scanline_c( uint8_t *data, int width )
@@ -658,7 +607,6 @@ static void filter_luma_121_packed422_inplace_scanline_c( uint8_t *data, int wid
     int r1 = 0;
     int r2 = 0;
 
-    SPEEDY_START();
     data += 2;
     width -= 1;
     while( width-- ) {
@@ -668,7 +616,6 @@ static void filter_luma_121_packed422_inplace_scanline_c( uint8_t *data, int wid
         *(data - 2) = s2 >> 2;
         data += 2;
     }
-    SPEEDY_END();
 }
 
 static void filter_luma_14641_packed422_inplace_scanline_c( uint8_t *data, int width )
@@ -678,7 +625,6 @@ static void filter_luma_14641_packed422_inplace_scanline_c( uint8_t *data, int w
     int r3 = 0;
     int r4 = 0;
 
-    SPEEDY_START();
     width -= 4;
     data += 4;
     while( width-- ) {
@@ -690,7 +636,6 @@ static void filter_luma_14641_packed422_inplace_scanline_c( uint8_t *data, int w
         *(data - 4) = s4 >> 4;
         data += 2;
     }
-    SPEEDY_END();
 }
 
 static void interpolate_packed422_scanline_c( uint8_t *output, uint8_t *top,
@@ -698,25 +643,18 @@ static void interpolate_packed422_scanline_c( uint8_t *output, uint8_t *top,
 {
     int i;
 
-    SPEEDY_START();
-
     for( i = width*2; i; --i ) {
         *output++ = ((*top++) + (*bot++)) >> 1;
     }
-
-    SPEEDY_END();
 }
 
-#ifdef ARCH_X86
-
 static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
                                          uint8_t *bot, int width )
 {
+#ifdef ARCH_X86
     const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */
     int i;
 
-    SPEEDY_START();
-
     for( i = width/16; i; --i ) {
         movq_m2r( *bot, mm0 );
         movq_m2r( *top, mm1 );
@@ -777,17 +715,15 @@ static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
     }
 
     emms();
-
-    SPEEDY_END();
+#endif
 }
 
 static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top,
                                             uint8_t *bot, int width )
 {
+#ifdef ARCH_X86
     int i;
 
-    SPEEDY_START();
-
     for( i = width/16; i; --i ) {
         movq_m2r( *bot, mm0 );
         movq_m2r( *top, mm1 );
@@ -829,35 +765,25 @@ static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top
 
     sfence();
     emms();
-
-    SPEEDY_END();
-}
-
 #endif
+}
 
 static void blit_colour_packed422_scanline_c( uint8_t *output, int width, int y, int cb, int cr )
 {
     uint32_t colour = cr << 24 | y << 16 | cb << 8 | y;
     uint32_t *o = (uint32_t *) output;
 
-    SPEEDY_START();
-
     for( width /= 2; width; --width ) {
         *o++ = colour;
     }
-
-    SPEEDY_END();
 }
 
-#ifdef ARCH_X86
-
 static void blit_colour_packed422_scanline_mmx( uint8_t *output, int width, int y, int cb, int cr )
 {
+#ifdef ARCH_X86
     uint32_t colour = cr << 24 | y << 16 | cb << 8 | y;
     int i;
 
-    SPEEDY_START();
-
     movd_m2r( colour, mm1 );
     movd_m2r( colour, mm2 );
     psllq_i2r( 32, mm1 );
@@ -889,17 +815,15 @@ static void blit_colour_packed422_scanline_mmx( uint8_t *output, int width, int
     }
 
     emms();
-
-    SPEEDY_END();
+#endif
 }
 
 static void blit_colour_packed422_scanline_mmxext( uint8_t *output, int width, int y, int cb, int cr )
 {
+#ifdef ARCH_X86
     uint32_t colour = cr << 24 | y << 16 | cb << 8 | y;
     int i;
 
-    SPEEDY_START();
-
     movd_m2r( colour, mm1 );
     movd_m2r( colour, mm2 );
     psllq_i2r( 32, mm1 );
@@ -932,40 +856,30 @@ static void blit_colour_packed422_scanline_mmxext( uint8_t *output, int width, i
 
     sfence();
     emms();
-
-    SPEEDY_END();
-}
-
 #endif
+}
 
 static void blit_colour_packed4444_scanline_c( uint8_t *output, int width,
                                         int alpha, int luma, int cb, int cr )
 {
     int j;
 
-    SPEEDY_START();
-
     for( j = 0; j < width; j++ ) {
         *output++ = alpha;
         *output++ = luma;
         *output++ = cb;
         *output++ = cr;
     }
-
-    SPEEDY_END();
 }
 
-#ifdef ARCH_X86
-
 static void blit_colour_packed4444_scanline_mmx( uint8_t *output, int width,
                                           int alpha, int luma,
                                           int cb, int cr )
 {
+#ifdef ARCH_X86
     uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha;
     int i;
 
-    SPEEDY_START();
-
     movd_m2r( colour, mm1 );
     movd_m2r( colour, mm2 );
     psllq_i2r( 32, mm1 );
@@ -992,19 +906,17 @@ static void blit_colour_packed4444_scanline_mmx( uint8_t *output, int width,
     }
 
     emms();
-
-    SPEEDY_END();
+#endif
 }
 
 void blit_colour_packed4444_scanline_mmxext( uint8_t *output, int width,
                                              int alpha, int luma,
                                              int cb, int cr )
 {
+#ifdef ARCH_X86
     uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha;
     int i;
 
-    SPEEDY_START();
-
     movd_m2r( colour, mm1 );
     movd_m2r( colour, mm2 );
     psllq_i2r( 32, mm1 );
@@ -1032,25 +944,28 @@ void blit_colour_packed4444_scanline_mmxext( uint8_t *output, int width,
 
     sfence();
     emms();
-
-    SPEEDY_END();
+#endif
 }
 
-#endif
+
+#define speedy_memcpy_c xine_fast_memcpy
+#define speedy_memcpy_mmx xine_fast_memcpy
+#define speedy_memcpy_mmxext xine_fast_memcpy
+
 
 static void blit_packed422_scanline_c( uint8_t *dest, const uint8_t *src, int width )
 {
-    xine_fast_memcpy( dest, src, width*2 );
+    speedy_memcpy_c( dest, src, width*2 );
 }
 
 static void blit_packed422_scanline_mmx( uint8_t *dest, const uint8_t *src, int width )
 {
-    xine_fast_memcpy( dest, src, width*2 );
+    speedy_memcpy_mmx( dest, src, width*2 );
 }
 
 static void blit_packed422_scanline_mmxext( uint8_t *dest, const uint8_t *src, int width )
 {
-    xine_fast_memcpy( dest, src, width*2 );
+    speedy_memcpy_mmxext( dest, src, width*2 );
 }
 
 static void composite_packed4444_alpha_to_packed422_scanline_c( uint8_t *output, uint8_t *input,
@@ -1058,7 +973,6 @@ static void composite_packed4444_alpha_to_packed422_scanline_c( uint8_t *output,
 {
     int i;
 
-    SPEEDY_START();
     for( i = 0; i < width; i++ ) {
         int af = foreground[ 0 ];
 
@@ -1110,16 +1024,14 @@ static void composite_packed4444_alpha_to_packed422_scanline_c( uint8_t *output,
         output += 2;
         input += 2;
     }
-    SPEEDY_END();
 }
 
-#ifdef ARCH_X86
-
 static void composite_packed4444_alpha_to_packed422_scanline_mmxext( uint8_t *output,
                                                               uint8_t *input,
                                                               uint8_t *foreground,
                                                               int width, int alpha )
 {
+#ifdef ARCH_X86
     const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
     const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
     const mmx_t round  = { 0x0080008000800080ULL };
@@ -1135,7 +1047,6 @@ static void composite_packed4444_alpha_to_packed422_scanline_mmxext( uint8_t *ou
         return;
     }
 
-    SPEEDY_START();
     READ_PREFETCH_2048( input );
     READ_PREFETCH_2048( foreground );
 
@@ -1214,17 +1125,13 @@ static void composite_packed4444_alpha_to_packed422_scanline_mmxext( uint8_t *ou
     }
     sfence();
     emms();
-
-    SPEEDY_END();
-}
-
 #endif
+}
 
 static void composite_packed4444_to_packed422_scanline_c( uint8_t *output, uint8_t *input,
                                                    uint8_t *foreground, int width )
 {
     int i;
-    SPEEDY_START();
     for( i = 0; i < width; i++ ) {
         int a = foreground[ 0 ];
 
@@ -1258,20 +1165,18 @@ static void composite_packed4444_to_packed422_scanline_c( uint8_t *output, uint8
         output += 2;
         input += 2;
     }
-    SPEEDY_END();
 }
 
-#ifdef ARCH_X86
 
 static void composite_packed4444_to_packed422_scanline_mmxext( uint8_t *output, uint8_t *input,
                                                         uint8_t *foreground, int width )
 {
+#ifdef ARCH_X86
     const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
     const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
     const mmx_t round  = { 0x0080008000800080ULL };
     int i;
 
-    SPEEDY_START();
     READ_PREFETCH_2048( input );
     READ_PREFETCH_2048( foreground );
 
@@ -1355,11 +1260,8 @@ static void composite_packed4444_to_packed422_scanline_mmxext( uint8_t *output,
     }
     sfence();
     emms();
-
-    SPEEDY_END();
-}
-
 #endif
+}
 
 /**
  * um... just need some scrap paper...
@@ -1379,8 +1281,6 @@ static void composite_alphamask_to_packed4444_scanline_c( uint8_t *output,
     uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;
     int i;
 
-    SPEEDY_START();
-
     for( i = 0; i < width; i++ ) {
         int a = *mask;
 
@@ -1400,11 +1300,8 @@ static void composite_alphamask_to_packed4444_scanline_c( uint8_t *output,
         output += 4;
         input += 4;
     }
-    SPEEDY_END();
 }
 
-#ifdef ARCH_X86
-
 static void composite_alphamask_to_packed4444_scanline_mmxext( uint8_t *output,
                                                         uint8_t *input,
                                                         uint8_t *mask,
@@ -1412,13 +1309,12 @@ static void composite_alphamask_to_packed4444_scanline_mmxext( uint8_t *output,
                                                         int textluma, int textcb,
                                                         int textcr )
 {
+#ifdef ARCH_X86
     uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;
     const mmx_t round = { 0x0080008000800080ULL };
     const mmx_t fullalpha = { 0x00000000000000ffULL };
     mmx_t colour;
 
-    SPEEDY_START();
-
     colour.w[ 0 ] = 0x00;
     colour.w[ 1 ] = textluma;
     colour.w[ 2 ] = textcb;
@@ -1502,10 +1398,8 @@ static void composite_alphamask_to_packed4444_scanline_mmxext( uint8_t *output,
     }
     sfence();
     emms();
-    SPEEDY_END();
-}
-
 #endif
+}
 
 static void composite_alphamask_alpha_to_packed4444_scanline_c( uint8_t *output,
                                                        uint8_t *input,
@@ -1516,8 +1410,6 @@ static void composite_alphamask_alpha_to_packed4444_scanline_c( uint8_t *output,
     uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;
     int i;
 
-    SPEEDY_START();
-
     for( i = 0; i < width; i++ ) {
         int af = *mask;
 
@@ -1541,14 +1433,10 @@ static void composite_alphamask_alpha_to_packed4444_scanline_c( uint8_t *output,
         output += 4;
         input += 4;
     }
-
-    SPEEDY_END();
 }
 
 static void premultiply_packed4444_scanline_c( uint8_t *output, uint8_t *input, int width )
 {
-    SPEEDY_START();
-
     while( width-- ) {
         unsigned int cur_a = input[ 0 ];
 
@@ -1560,20 +1448,15 @@ static void premultiply_packed4444_scanline_c( uint8_t *output, uint8_t *input,
         output += 4;
         input += 4;
     }
-
-    SPEEDY_END();
 }
 
-#ifdef ARCH_X86
-
 static void premultiply_packed4444_scanline_mmxext( uint8_t *output, uint8_t *input, int width )
 {
+#ifdef ARCH_X86
     const mmx_t round  = { 0x0080008000800080ULL };
     const mmx_t alpha  = { 0x00000000000000ffULL };
     const mmx_t noalp  = { 0xffffffffffff0000ULL };
 
-    SPEEDY_START();
-
     pxor_r2r( mm7, mm7 );
     while( width-- ) {
         movd_m2r( *input, mm0 );
@@ -1603,11 +1486,8 @@ static void premultiply_packed4444_scanline_mmxext( uint8_t *output, uint8_t *in
     }
     sfence();
     emms();
-
-    SPEEDY_END();
-}
-
 #endif
+}
 
 static void blend_packed422_scanline_c( uint8_t *output, uint8_t *src1,
                                  uint8_t *src2, int width, int pos )
@@ -1626,11 +1506,10 @@ static void blend_packed422_scanline_c( uint8_t *output, uint8_t *src1,
     }
 }
 
-#ifdef ARCH_X86
-
 static void blend_packed422_scanline_mmxext( uint8_t *output, uint8_t *src1,
                                       uint8_t *src2, int width, int pos )
 {
+#ifdef ARCH_X86
     if( pos <= 0 ) {
         blit_packed422_scanline( output, src1, width );
     } else if( pos >= 256 ) {
@@ -1641,8 +1520,6 @@ static void blend_packed422_scanline_mmxext( uint8_t *output, uint8_t *src1,
         const mmx_t all256 = { 0x0100010001000100ULL };
         const mmx_t round  = { 0x0080008000800080ULL };
 
-        SPEEDY_START();
-
         movd_m2r( pos, mm0 );
         pshufw_r2r( mm0, mm0, 0 );
         movq_m2r( all256, mm1 );
@@ -1670,17 +1547,16 @@ static void blend_packed422_scanline_mmxext( uint8_t *output, uint8_t *src1,
         }
         sfence();
         emms();
-
-        SPEEDY_END();
     }
+#endif
 }
 
 static void quarter_blit_vertical_packed422_scanline_mmxext( uint8_t *output, uint8_t *one,
                                                       uint8_t *three, int width )
 {
+#ifdef ARCH_X86
     int i;
 
-    SPEEDY_START();
     for( i = width/16; i; --i ) {
         movq_m2r( *one, mm0 );
         movq_m2r( *three, mm1 );
@@ -1729,23 +1605,19 @@ static void quarter_blit_vertical_packed422_scanline_mmxext( uint8_t *output, ui
 
     sfence();
     emms();
-
-    SPEEDY_END();
+#endif
 }
 
-#endif
 
 static void quarter_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *one,
                                                  uint8_t *three, int width )
 {
-    SPEEDY_START();
     width *= 2;
     while( width-- ) {
         *output++ = (*one + *three + *three + *three + 2) / 4;
         one++;
         three++;
     }
-    SPEEDY_END();
 }
 
 static void subpix_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *top,
@@ -1760,13 +1632,10 @@ static void subpix_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t
     } else {
         int x;
 
-        SPEEDY_START();
-
         width *= 2;
         for( x = 0; x < width; x++ ) {
             output[ x ] = ( ( top[ x ] * subpixpos ) + ( bot[ x ] * ( 0xffff - subpixpos ) ) ) >> 16;
         }
-        SPEEDY_END();
     }
 }
 
@@ -1783,12 +1652,355 @@ static void a8_subpix_blit_scanline_c( uint8_t *output, uint8_t *input,
     }
 }
 
+/**
+ * These are from lavtools in mjpegtools:
+ *
+ * colorspace.c:  Routines to perform colorspace conversions.
+ *
+ *  Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#define FP_BITS 18  /* fractional bits of the fixed-point table entries */
+
+/* Precomputed lookup tables, filled on first use by the init_*_tables() functions. */
+
+static int Y_R[256];
+static int Y_G[256];
+static int Y_B[256];
+static int Cb_R[256];
+static int Cb_G[256];
+static int Cb_B[256];
+static int Cr_R[256];
+static int Cr_G[256];
+static int Cr_B[256];
+static int conv_RY_inited = 0;  /* set once the R'G'B' -> Y'CbCr tables are filled */
+
+static int RGB_Y[256];
+static int R_Cr[256];
+static int G_Cb[256];
+static int G_Cr[256];
+static int B_Cb[256];
+static int conv_YR_inited = 0;  /* set once the Y'CbCr -> R'G'B' tables are filled */
+
+static int myround(double n)
+{
+  /* Round to nearest, ties away from zero: bias by half a unit toward the
+   * sign of n, then let the (int) cast truncate toward zero. */
+  const double bias = (n >= 0) ? 0.5 : -0.5;
+  return (int)(n + bias);
+}
+
+static void init_RGB_to_YCbCr_tables(void)
+{
+  int i;
+
+  /*
+   * Fills the nine R'G'B' -> Y'CbCr tables, then sets conv_RY_inited.
+   *
+   * Q_Z[i] =   (coefficient * i
+   *             * (Q-excursion) / (Z-excursion) * fixed-point-factor)
+   * to one of each (the *_B tables), add the following:
+   *             + (fixed-point-factor / 2)         --- for rounding later
+   *             + (Q-offset * fixed-point-factor)  --- to add the offset
+   */
+  for (i = 0; i < 256; i++) {
+    Y_R[i] = myround(0.299 * (double)i * 219.0 / 255.0 * (double)(1<<FP_BITS));
+    Y_G[i] = myround(0.587 * (double)i * 219.0 / 255.0 * (double)(1<<FP_BITS));
+    Y_B[i] = myround((0.114 * (double)i * 219.0 / 255.0 * (double)(1<<FP_BITS))
+                     + (double)(1<<(FP_BITS-1)) + (16.0 * (double)(1<<FP_BITS)));
+
+    Cb_R[i] = myround(-0.168736 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS));
+    Cb_G[i] = myround(-0.331264 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS));
+    Cb_B[i] = myround((0.500 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS))
+                       + (double)(1<<(FP_BITS-1)) + (128.0 * (double)(1<<FP_BITS)));
+
+    Cr_R[i] = myround(0.500 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS));
+    Cr_G[i] = myround(-0.418688 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS));
+    Cr_B[i] = myround((-0.081312 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS))
+                      + (double)(1<<(FP_BITS-1)) + (128.0 * (double)(1<<FP_BITS)));
+  }
+  conv_RY_inited = 1;
+}
+
+static void init_YCbCr_to_RGB_tables(void)
+{
+  int i;
+
+  /*
+   * Fills the Y'CbCr -> R'G'B' tables, then sets conv_YR_inited.
+   *
+   * Q_Z[i] =   (coefficient * (i - Z-offset)
+   *             * (Q-excursion) / (Z-excursion) * fixed-point-factor)
+   *
+   * RGB_Y also carries (fixed-point-factor / 2) for rounding later.  Inputs
+   * outside the nominal range get the entry of the nearest in-range value.
+   */
+
+  /* clip Y values under 16 */
+  for (i = 0; i < 16; i++) {
+    RGB_Y[i] = myround((1.0 * (double)(16 - 16) * 255.0 / 219.0 * (double)(1<<FP_BITS))
+                       + (double)(1<<(FP_BITS-1)));
+  }
+  for (i = 16; i < 236; i++) {
+    RGB_Y[i] = myround((1.0 * (double)(i - 16) * 255.0 / 219.0 * (double)(1<<FP_BITS))
+                       + (double)(1<<(FP_BITS-1)));
+  }
+  /* clip Y values above 235 */
+  for (i = 236; i < 256; i++) {
+    RGB_Y[i] = myround((1.0 * (double)(235 - 16) * 255.0 / 219.0 * (double)(1<<FP_BITS))
+                       + (double)(1<<(FP_BITS-1)));
+  }
+
+  /* clip Cb/Cr values below 16 */
+  for (i = 0; i < 16; i++) {
+    R_Cr[i] = myround(1.402 * (double)(-112) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    G_Cr[i] = myround(-0.714136 * (double)(-112) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    G_Cb[i] = myround(-0.344136 * (double)(-112) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    B_Cb[i] = myround(1.772 * (double)(-112) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+  }
+  for (i = 16; i < 241; i++) {
+    R_Cr[i] = myround(1.402 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    G_Cr[i] = myround(-0.714136 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    G_Cb[i] = myround(-0.344136 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    B_Cb[i] = myround(1.772 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+  }
+  /* clip Cb/Cr values above 240 */
+  for (i = 241; i < 256; i++) {
+    R_Cr[i] = myround(1.402 * (double)(112) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    G_Cr[i] = myround(-0.714136 * (double)(112) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    G_Cb[i] = myround(-0.344136 * (double)(112) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+    B_Cb[i] = myround(1.772 * (double)(112) * 255.0 / 224.0 * (double)(1<<FP_BITS));
+  }
+  conv_YR_inited = 1;
+}
+
+void rgb24_to_packed444_rec601_scanline( uint8_t *output, uint8_t *input, int width )
+{
+    /* Converts width R'G'B' triplets to Y'CbCr triplets via the lookup tables. */
+    if( !conv_RY_inited ) {
+        init_RGB_to_YCbCr_tables();
+    }
+    for( ; width != 0; width--, input += 3, output += 3 ) {
+        const int red = input[ 0 ];
+        const int green = input[ 1 ];
+        const int blue = input[ 2 ];
+
+        output[ 0 ] = (Y_R[ red ] + Y_G[ green ] + Y_B[ blue ]) >> FP_BITS;
+        output[ 1 ] = (Cb_R[ red ] + Cb_G[ green ] + Cb_B[ blue ]) >> FP_BITS;
+        output[ 2 ] = (Cr_R[ red ] + Cr_G[ green ] + Cr_B[ blue ]) >> FP_BITS;
+    }
+}
+
+void rgba32_to_packed4444_rec601_scanline( uint8_t *output, uint8_t *input, int width )
+{
+    /* Converts width RGBA quads to AY'CbCr quads; alpha is copied and moved first. */
+    if( !conv_RY_inited ) {
+        init_RGB_to_YCbCr_tables();
+    }
+    for( ; width != 0; width--, input += 4, output += 4 ) {
+        const int red = input[ 0 ];
+        const int green = input[ 1 ];
+        const int blue = input[ 2 ];
+        const int alpha = input[ 3 ];
+
+        output[ 0 ] = alpha;
+        output[ 1 ] = (Y_R[ red ] + Y_G[ green ] + Y_B[ blue ]) >> FP_BITS;
+        output[ 2 ] = (Cb_R[ red ] + Cb_G[ green ] + Cb_B[ blue ]) >> FP_BITS;
+        output[ 3 ] = (Cr_R[ red ] + Cr_G[ green ] + Cr_B[ blue ]) >> FP_BITS;
+    }
+}
+
+void packed444_to_rgb24_rec601_scanline( uint8_t *output, uint8_t *input, int width )
+{
+    /* Converts width Y'CbCr triplets to R'G'B' via the tables, clamping with clip255(). */
+    if( !conv_YR_inited ) init_YCbCr_to_RGB_tables();
+    for( ; width != 0; width--, input += 3, output += 3 ) {
+        const int y_term = RGB_Y[ input[ 0 ] ];
+        const int cb = input[ 1 ];
+        const int cr = input[ 2 ];
+        const int red = y_term + R_Cr[ cr ];
+        const int green = y_term + G_Cb[ cb ] + G_Cr[ cr ];
+        const int blue = y_term + B_Cb[ cb ];
+
+        output[ 0 ] = clip255( red >> FP_BITS );
+        output[ 1 ] = clip255( green >> FP_BITS );
+        output[ 2 ] = clip255( blue >> FP_BITS );
+    }
+}
+
+/**
+ * 601 numbers:
+ *
+ * Y' =  0.299*R' + 0.587*G' + 0.114*B' (in  0.0 to  1.0)
+ * Cb = -0.169*R' - 0.331*G' + 0.500*B' (in -0.5 to +0.5)
+ * Cr =  0.500*R' - 0.419*G' - 0.081*B' (in -0.5 to +0.5)
+ *
+ * Inverse:
+ *      Y         Cb        Cr
+ * R  1.0000   -0.0009    1.4017
+ * G  1.0000   -0.3437   -0.7142
+ * B  1.0000    1.7722    0.0010
+ *
+ * S170M numbers:
+ * Y'   =  0.299*R' + 0.587*G' + 0.114*B' (in  0.0 to 1.0)
+ * B-Y' = -0.299*R' - 0.587*G' + 0.886*B'
+ * R-Y' =  0.701*R' - 0.587*G' - 0.114*B'
+ */
+void packed444_to_rgb24_rec601_reference_scanline( uint8_t *output, uint8_t *input, int width )
+{   /* Floating-point Y'CbCr -> R'G'B' conversion of width pixels; uses no lookup tables. */
+    while( width-- ) {
+        double yp = (((double) input[ 0 ]) - 16.0) / 255.0;
+        double cb = (((double) input[ 1 ]) - 128.0) / 255.0;
+        double cr = (((double) input[ 2 ]) - 128.0) / 255.0;
+        double r, g, b;
+        /* NOTE(review): divides by 255, not the 219/224 excursions the tables use -- confirm. */
+        r = yp - (0.0009*cb) + (1.4017*cr);
+        g = yp - (0.3437*cb) - (0.7142*cr);
+        b = yp + (1.7722*cb) + (0.0010*cr);
+        /* Clamp each channel to [0.0, 1.0]. */
+        if( r > 1.0 ) r = 1.0; else if( r < 0.0 ) r = 0.0;
+        if( g > 1.0 ) g = 1.0; else if( g < 0.0 ) g = 0.0;
+        if( b > 1.0 ) b = 1.0; else if( b < 0.0 ) b = 0.0;
+        /* Scale to 0..255; adding 0.5 before the (int) cast rounds to nearest. */
+        output[ 0 ] = (int) ((r * 255.0) + 0.5);
+        output[ 1 ] = (int) ((g * 255.0) + 0.5);
+        output[ 2 ] = (int) ((b * 255.0) + 0.5);
+
+        output += 3;
+        input += 3;
+    }
+}
+
+void packed444_to_nonpremultiplied_packed4444_scanline( uint8_t *output, 
+                                                        uint8_t *input,
+                                                        int width, int alpha ) /* width is in pixels; the same alpha is written to every output pixel */
+{ /* Expands 3-byte pixels to 4-byte pixels by prepending an alpha byte; the colour bytes are copied unscaled. */
+    int i;
+
+    for( i = 0; i < width; i++ ) {
+        output[ 0 ] = alpha & 0xff; /* first byte: low 8 bits of the caller's alpha */
+        output[ 1 ] = input[ 0 ] & 0xff; /* remaining three bytes: the input pixel's bytes in the same order */
+        output[ 2 ] = input[ 1 ] & 0xff;
+        output[ 3 ] = input[ 2 ] & 0xff;
+
+        output += 4; /* 4 bytes per output pixel, 3 bytes per input pixel */
+        input += 3;
+    }
+
+}
+
+void aspect_adjust_packed4444_scanline( uint8_t *output,
+                                        uint8_t *input, 
+                                        int width,
+                                        double pixel_aspect ) /* width is the input width in 4-byte pixels */
+{ /* Resamples a 4-byte-per-pixel scanline horizontally, stepping through the input by 1/pixel_aspect per output pixel. */
+    double i; /* fractional read position, in input pixels */
+    int prev_i = 0; /* integer read position of the previous output pixel */
+    int w = 0; /* incremented once per output pixel but never read */
+
+    pixel_aspect = 1.0 / pixel_aspect; /* from here on this is the input step per output pixel; NOTE(review): a non-positive pixel_aspect is not guarded */
+
+    for( i = 0.0; i < width; i += pixel_aspect ) {
+        uint8_t *curin = input + ((int) i)*4; /* input pixel at the truncated read position */
+
+        if( !prev_i ) { /* while the previous position is still pixel 0, copy the current pixel unchanged */
+            output[ 0 ] = curin[ 0 ];
+            output[ 1 ] = curin[ 1 ];
+            output[ 2 ] = curin[ 2 ];
+            output[ 3 ] = curin[ 3 ];
+        } else { /* otherwise average the input pixels from prev_i through (int) i inclusive */
+            int avg_a = 0; /* per-channel sums; divided by c below */
+            int avg_y = 0;
+            int avg_cb = 0;
+            int avg_cr = 0;
+            int pos = prev_i * 4; /* byte offset of pixel prev_i */
+            int c = 0; /* number of pixels summed */
+            int j;
+
+            for( j = prev_i; j <= (int) i; j++ ) {
+                avg_a += input[ pos++ ]; /* four consecutive bytes per pixel, in channel order */
+                avg_y += input[ pos++ ];
+                avg_cb += input[ pos++ ];
+                avg_cr += input[ pos++ ];
+                c++;
+            }
+            output[ 0 ] = avg_a / c; /* integer division truncates the mean */
+            output[ 1 ] = avg_y / c;
+            output[ 2 ] = avg_cb / c;
+            output[ 3 ] = avg_cr / c;
+        }
+        output += 4; /* one 4-byte output pixel per loop iteration */
+        prev_i = (int) i;
+        w++;
+    }
+}
+
+/**
+ * Sub-pixel data bar renderer.  There are 128 bars.
+ */
+void composite_bars_packed4444_scanline( uint8_t *output,
+                                         uint8_t *background, int width,
+                                         int a, int luma, int cb, int cr,
+                                         int percentage ) /* percentage is the number of bars drawn, starting from the left */
+{
+    /**
+     * This is the size of both the bar and the spacing in between in subpixel
+     * units out of 256.  Yes, as it so happens, that puts it equal to 'width'.
+     */
+    int barsize = ( width * 256 ) / 256;
+    int i;
+
+    /* We only need to composite the bar on the pixels that matter. */
+    for( i = 0; i < percentage; i++ ) { /* NOTE(review): percentage is not bounded here; above 128 pixend reaches width or beyond - confirm callers clamp */
+        int barstart = i * barsize * 2; /* each bar is followed by a gap of the same size, hence the stride of 2*barsize */
+        int barend = barstart + barsize;
+        int pixstart = barstart / 256; /* subpixel positions converted to whole-pixel indices */
+        int pixend = barend / 256;
+        int j;
+
+        for( j = pixstart; j <= pixend; j++ ) {
+            uint8_t *curout = output + (j*4); /* 4 bytes per pixel in both buffers */
+            uint8_t *curin = background + (j*4);
+            int curstart = j * 256; /* subpixel extent of pixel j */
+            int curend = curstart + 256;
+            int alpha;
+
+            if( barstart > curstart ) curstart = barstart; /* intersect the pixel's extent with the bar */
+            if( barend < curend ) curend = barend;
+            if( curend - curstart < 256 ) { /* partially covered pixel: scale a by the covered fraction */
+                alpha = ( ( curend - curstart ) * a ) / 256;
+            } else { /* fully covered pixel: use a unchanged */
+                alpha = a;
+            }
+
+            curout[ 0 ] = curin[ 0 ] + multiply_alpha( alpha - curin[ 0 ], alpha ); /* each channel: background + multiply_alpha( target - background, alpha ); channel 0 targets alpha itself */
+            curout[ 1 ] = curin[ 1 ] + multiply_alpha( luma - curin[ 1 ], alpha );
+            curout[ 2 ] = curin[ 2 ] + multiply_alpha( cb - curin[ 2 ], alpha );
+            curout[ 3 ] = curin[ 3 ] + multiply_alpha( cr - curin[ 3 ], alpha );
+        }
+    }
+}
+
 
 static uint32_t speedy_accel;
 
-void setup_speedy_calls( int verbose )
+void setup_speedy_calls( uint32_t accel, int verbose )
 {
-    speedy_accel = xine_mm_accel();
+    speedy_accel = accel;
 
     interpolate_packed422_scanline = interpolate_packed422_scanline_c;
     blit_colour_packed422_scanline = blit_colour_packed422_scanline_c;
@@ -1807,7 +2019,7 @@ void setup_speedy_calls( int verbose )
     kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_c;
     mirror_packed422_inplace_scanline = mirror_packed422_inplace_scanline_c;
     halfmirror_packed422_inplace_scanline = halfmirror_packed422_inplace_scanline_c;
-    speedy_memcpy = xine_fast_memcpy;
+    speedy_memcpy = speedy_memcpy_c;
     diff_packed422_block8x8 = diff_packed422_block8x8_c;
     a8_subpix_blit_scanline = a8_subpix_blit_scanline_c;
     quarter_blit_vertical_packed422_scanline = quarter_blit_vertical_packed422_scanline_c;
@@ -1832,6 +2044,7 @@ void setup_speedy_calls( int verbose )
         comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx;
         diff_packed422_block8x8 = diff_packed422_block8x8_mmx;
         quarter_blit_vertical_packed422_scanline = quarter_blit_vertical_packed422_scanline_mmxext;
+        speedy_memcpy = speedy_memcpy_mmxext;
     } else if( speedy_accel & MM_ACCEL_X86_MMX ) {
         if( verbose ) {
             fprintf( stderr, "speedycode: Using MMX optimized functions.\n" );
@@ -1844,16 +2057,16 @@ void setup_speedy_calls( int verbose )
         comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx;
         kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_mmx;
         diff_packed422_block8x8 = diff_packed422_block8x8_mmx;
-    } else
-#endif
-    {
+        speedy_memcpy = speedy_memcpy_mmx;
+    } else {
         if( verbose ) {
             fprintf( stderr, "speedycode: No MMX or MMXEXT support detected, using C fallbacks.\n" );
         }
     }
+#endif
 }
 
-int speedy_get_accel( void )
+uint32_t speedy_get_accel( void ) /* returns the accel bitfield last stored by setup_speedy_calls() */
 {
     return speedy_accel;
 }
diff --git a/src/post/deinterlace/speedy.h b/src/post/deinterlace/speedy.h
index 341f85494..734050b40 100644
--- a/src/post/deinterlace/speedy.h
+++ b/src/post/deinterlace/speedy.h
@@ -124,14 +124,16 @@ extern void (*subpix_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t
                                                         uint8_t *bot, int subpixpos, int width );
 
 /**
- * Sets up the function pointers to point at the fastest function available.
+ * Sets up the function pointers to point at the fastest function
+ * available.  Requires acceleration settings (see mm_accel.h).
  */
-void setup_speedy_calls( int verbose );
+void setup_speedy_calls( uint32_t accel, int verbose );
 
 /**
- * Returns a bitfield of what accellerations are available.  See mm_accel.h.
+ * Returns a bitfield of what accelerations were used when speedy was
+ * initialized.  See mm_accel.h.
  */
-int speedy_get_accel( void );
+uint32_t speedy_get_accel( void );
 
 #ifdef __cplusplus
 };
diff --git a/src/post/deinterlace/xine_plugin.c b/src/post/deinterlace/xine_plugin.c
index fa65df4f0..76bb1d46e 100644
--- a/src/post/deinterlace/xine_plugin.c
+++ b/src/post/deinterlace/xine_plugin.c
@@ -17,7 +17,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
  *
- * $Id: xine_plugin.c,v 1.3 2003/06/15 15:25:35 miguelfreitas Exp $
+ * $Id: xine_plugin.c,v 1.4 2003/06/16 18:28:11 miguelfreitas Exp $
  *
  * advanced video deinterlacer plugin
  * Jun/2003 by Miguel Freitas
@@ -231,7 +231,7 @@ static void *deinterlace_init_plugin(xine_t *xine, void *data)
   class->dispose         = deinterlace_class_dispose;
 
 
-  setup_speedy_calls(0);
+  setup_speedy_calls(xine_mm_accel(),0);
 
   linear_plugin_init();
   linearblend_plugin_init();