1 files changed, 1856 insertions, 0 deletions
diff --git a/src/post/deinterlace/speedy.c b/src/post/deinterlace/speedy.c
new file mode 100644
index 000000000..b06e4bc88
--- /dev/null
+++ b/src/post/deinterlace/speedy.c
@@ -0,0 +1,1856 @@
+/**
+ * Copyright (c) 2002, 2003 Billy Biggs <vektor@dumbterm.net>.
+ * Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Uses code from:
+ *
+ *  linux/arch/i386/kernel/setup.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ * Found in linux 2.4.20.
+ *
+ * Also helped by code from 'cpuinfo.c' found in mplayer.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <ctype.h>
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "attributes.h"
+#include "xineutils.h"
+#include "speedtools.h"
+#include "speedy.h"
+
+/*
+ * Function pointer definitions.
+ *
+ * One global pointer per scanline/block operation.  They are defined here
+ * without an initializer; the code that assigns an implementation to them
+ * is not in this section of the file (presumably one of the _c, _mmx or
+ * _mmxext variants defined below is selected -- confirm against the setup
+ * code).
+ */
+void (*interpolate_packed422_scanline)( uint8_t *output, uint8_t *top,
+                                        uint8_t *bot, int width );
+void (*blit_colour_packed422_scanline)( uint8_t *output,
+                                        int width, int y, int cb, int cr );
+void (*blit_colour_packed4444_scanline)( uint8_t *output,
+                                         int width, int alpha, int luma,
+                                         int cb, int cr );
+void (*blit_packed422_scanline)( uint8_t *dest, const uint8_t *src, int width );
+void (*composite_packed4444_to_packed422_scanline)( uint8_t *output, uint8_t *input,
+                                                    uint8_t *foreground, int width );
+void (*composite_packed4444_alpha_to_packed422_scanline)( uint8_t *output,
+                                                          uint8_t *input,
+                                                          uint8_t *foreground,
+                                                          int width, int alpha );
+void (*composite_alphamask_to_packed4444_scanline)( uint8_t *output,
+                                                uint8_t *input,
+                                                uint8_t *mask, int width,
+                                                int textluma, int textcb,
+                                                int textcr );
+void (*composite_alphamask_alpha_to_packed4444_scanline)( uint8_t *output,
+                                                       uint8_t *input,
+                                                       uint8_t *mask, int width,
+                                                       int textluma, int textcb,
+                                                       int textcr, int alpha );
+void (*premultiply_packed4444_scanline)( uint8_t *output, uint8_t *input, int width );
+void (*blend_packed422_scanline)( uint8_t *output, uint8_t *src1,
+                                  uint8_t *src2, int width, int pos );
+void (*filter_luma_121_packed422_inplace_scanline)( uint8_t *data, int width );
+void (*filter_luma_14641_packed422_inplace_scanline)( uint8_t *data, int width );
+unsigned int (*diff_factor_packed422_scanline)( uint8_t *cur, uint8_t *old, int width );
+unsigned int (*comb_factor_packed422_scanline)( uint8_t *top, uint8_t *mid,
+                                                uint8_t *bot, int width );
+void (*kill_chroma_packed422_inplace_scanline)( uint8_t *data, int width );
+void (*mirror_packed422_inplace_scanline)( uint8_t *data, int width );
+void (*halfmirror_packed422_inplace_scanline)( uint8_t *data, int width );
+void *(*speedy_memcpy)( void *output, const void *input, size_t size );
+void (*diff_packed422_block8x8)( pulldown_metrics_t *m, uint8_t *old,
+                                 uint8_t *new, int os, int ns );
+void (*a8_subpix_blit_scanline)( uint8_t *output, uint8_t *input,
+                                 int lasta, int startpos, int width );
+void (*quarter_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *one,
+                                                  uint8_t *three, int width );
+void (*subpix_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *top,
+                                                 uint8_t *bot, int subpixpos, int width );
+
+
+/*
+ * Markers placed at the entry and exit of the routines in this file.
+ * Both expand to nothing here, so they generate no code.
+ */
+#define SPEEDY_START()
+
+#define SPEEDY_END()
+
+/**
+ * result = (1 - alpha)B + alpha*F
+ *        =  B - alpha*B + alpha*F
+ *        =  B + alpha*(F - B)
+ */
+
+/**
+ * Scales r by a/255 with rounding, without a division:
+ * with t = a*r + 128 the result is (t + t/256) / 256.
+ * For a and r in 0..255 the result is again in 0..255.
+ */
+static inline __attribute__ ((always_inline,const)) int multiply_alpha( int a, int r )
+{
+    const int scaled = (r * a) + 0x80;
+
+    return (scaled + (scaled >> 8)) >> 8;
+}
+
+/**
+ * Clamps x to the range 0..255 and returns it as a byte.
+ */
+static inline __attribute__ ((always_inline,const)) uint8_t clip255( int x )
+{
+    if( x < 0 ) {
+        return 0;
+    }
+    return ( x > 255 ) ? 255 : x;
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX routine that counts luma samples whose comb metric exceeds a fixed
+ * threshold.  The chroma bytes of the three scanlines are masked off, the
+ * luma values are halved, and for each luma sample the code evaluates, in
+ * 16-bit lanes,
+ *
+ *     (top - mid) * (bot - mid) - ((top - bot)^2 >> 7)
+ *
+ * A sample is counted when that value is greater than CombJaggieThreshold
+ * (73).
+ *
+ * The loop runs width / 4 times and consumes 8 bytes of each scanline per
+ * iteration; a remainder of width modulo 4 is not examined.
+ *
+ * Returns the sum of the four 16-bit lane counters.
+ */
+static unsigned int comb_factor_packed422_scanline_mmx( uint8_t *top, uint8_t *mid,
+                                                 uint8_t *bot, int width )
+{
+    const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL };
+    const mmx_t qwOnes = { 0x0001000100010001ULL };
+    mmx_t qwThreshold;
+    unsigned int temp1, temp2;
+    unsigned long CombJaggieThreshold = 73;
+
+    SPEEDY_START();
+
+    width /= 4;
+
+    /* Replicate the threshold into all four 16-bit lanes. */
+    qwThreshold.uw[ 0 ] = CombJaggieThreshold;
+    qwThreshold.uw[ 1 ] = CombJaggieThreshold;
+    qwThreshold.uw[ 2 ] = CombJaggieThreshold;
+    qwThreshold.uw[ 3 ] = CombJaggieThreshold;
+
+    movq_m2r( qwThreshold, mm0 );
+    movq_m2r( qwYMask, mm1 );
+    movq_m2r( qwOnes, mm2 );
+    pxor_r2r( mm7, mm7 );         /* mm7 = 0, the per-lane counters. */
+
+    while( width-- ) {
+        /* Load and keep just the luma. */
+        movq_m2r( *top, mm3 );
+        movq_m2r( *mid, mm4 );
+        movq_m2r( *bot, mm5 );
+
+        pand_r2r( mm1, mm3 );
+        pand_r2r( mm1, mm4 );
+        pand_r2r( mm1, mm5 );
+
+        /*
+         * Work out mm6 = (top - mid) * (bot - mid) - ( (top - bot)^2 >> 7 ),
+         * using luma values halved first.
+         */
+        psrlw_i2r( 1, mm3 );
+        psrlw_i2r( 1, mm4 );
+        psrlw_i2r( 1, mm5 );
+
+        /* mm6 = (top - mid) */
+        movq_r2r( mm3, mm6 );
+        psubw_r2r( mm4, mm6 );
+
+        /* mm3 = (top - bot) */
+        psubw_r2r( mm5, mm3 );
+
+        /* mm5 = (bot - mid) */
+        psubw_r2r( mm4, mm5 );
+
+        /* mm6 = (top - mid) * (bot - mid) */
+        pmullw_r2r( mm5, mm6 );
+
+        /* mm3 = (top - bot)^2 >> 7 */
+        pmullw_r2r( mm3, mm3 );   /* mm3 = (top - bot)^2 */
+        psrlw_i2r( 7, mm3 );      /* mm3 = ((top - bot)^2 >> 7) */
+
+        /* mm6 is what we want. */
+        psubw_r2r( mm3, mm6 );
+
+        /* FF's if greater than qwThreshold */
+        pcmpgtw_r2r( mm0, mm6 );
+
+        /* Add to count if we are greater than threshold */
+        pand_r2r( mm2, mm6 );
+        paddw_r2r( mm6, mm7 );
+
+        top += 8;
+        mid += 8;
+        bot += 8;
+    }
+
+    /*
+     * Horizontal sum of the four 16-bit counters: add the two 32-bit halves
+     * of mm7, then fold the upper 16 bits of the result onto the lower 16.
+     */
+    movd_r2m( mm7, temp1 );
+    psrlq_i2r( 32, mm7 );
+    movd_r2m( mm7, temp2 );
+    temp1 += temp2;
+    temp2 = temp1;
+    temp1 >>= 16;
+    temp1 += temp2 & 0xffff;
+
+    emms();
+
+    SPEEDY_END();
+
+    return temp1;
+}
+
+#endif
+
+/* Right-shift count applied to each squared luma difference by the diff_factor routines. */
+static unsigned long BitShift = 6;
+
+/**
+ * Difference metric between two scanlines, plain C version.
+ *
+ * The scanlines are walked in groups of 8 bytes (width / 4 groups).  In each
+ * group the bytes at offsets 0, 2, 4 and 6 are averaged with rounding for
+ * both cur and old; the squared difference of the two averages, shifted
+ * right by BitShift, is added to the total that is returned.
+ */
+static unsigned int diff_factor_packed422_scanline_c( uint8_t *cur, uint8_t *old, int width )
+{
+    unsigned int total = 0;
+    int groups;
+
+    SPEEDY_START();
+
+    for( groups = width / 4; groups; --groups, cur += 8, old += 8 ) {
+        const unsigned int avg_cur = (cur[ 0 ] + cur[ 2 ] + cur[ 4 ] + cur[ 6 ] + 2)>>2;
+        const unsigned int avg_old = (old[ 0 ] + old[ 2 ] + old[ 4 ] + old[ 6 ] + 2)>>2;
+        /* May wrap when avg_old is larger; the square is still the true value. */
+        unsigned int delta = avg_cur - avg_old;
+
+        delta *= delta;
+        total += delta >> BitShift;
+    }
+
+    SPEEDY_END();
+
+    return total;
+}
+
+/**
+ * Subsampled variant of the scanline difference metric.
+ *
+ * Runs width / 16 times and advances 32 bytes per step, but only looks at
+ * the bytes at offsets 0, 2, 4 and 6 of each step.  The two averages are
+ * truncated (no rounding term); their squared difference, shifted right by
+ * BitShift, is accumulated and returned.
+ */
+static unsigned int diff_factor_packed422_scanline_test_c( uint8_t *cur, uint8_t *old, int width )
+{
+    unsigned int total = 0;
+    int steps;
+
+    SPEEDY_START();
+
+    for( steps = width / 16; steps; --steps, cur += (8*4), old += (8*4) ) {
+        const unsigned int avg_cur = (cur[ 0 ] + cur[ 2 ] + cur[ 4 ] + cur[ 6 ])>>2;
+        const unsigned int avg_old = (old[ 0 ] + old[ 2 ] + old[ 4 ] + old[ 6 ])>>2;
+        /* May wrap when avg_old is larger; the square is still the true value. */
+        unsigned int delta = avg_cur - avg_old;
+
+        delta *= delta;
+        total += delta >> BitShift;
+    }
+
+    SPEEDY_END();
+
+    return total;
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version of the scanline difference metric.
+ *
+ * Per iteration (width / 4 iterations, 8 bytes each) the chroma bytes are
+ * masked off, the four luma differences cur - old are squared and summed
+ * pairwise by pmaddwd, each of the two 32-bit sums is shifted right by
+ * BitShift and added to a running total.
+ *
+ * Unlike the C version this does not average the four luma samples before
+ * differencing, so the two variants do not return the same number.
+ *
+ * Returns the sum of the two 32-bit totals.
+ */
+static unsigned int diff_factor_packed422_scanline_mmx( uint8_t *cur, uint8_t *old, int width )
+{
+    const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL };
+    unsigned int temp1, temp2;
+
+    SPEEDY_START();
+
+    width /= 4;
+
+    movq_m2r( qwYMask, mm1 );
+    /* NOTE(review): BitShift is an unsigned long but only 32 bits are loaded here -- confirm intended on LP64. */
+    movd_m2r( BitShift, mm7 );
+    pxor_r2r( mm0, mm0 );        /* mm0 = running totals     */
+
+    while( width-- ) {
+        movq_m2r( *cur, mm4 );
+        movq_m2r( *old, mm5 );
+
+        /* Keep only the luma bytes. */
+        pand_r2r( mm1, mm4 );
+        pand_r2r( mm1, mm5 );
+
+        psubw_r2r( mm5, mm4 );   /* mm4 = Y1 - Y2            */
+        pmaddwd_r2r( mm4, mm4 ); /* mm4 = (Y1 - Y2)^2        */
+        psrld_r2r( mm7, mm4 );   /* divide mm4 by 2^BitShift */
+        paddd_r2r( mm4, mm0 );   /* keep total in mm0        */
+
+        cur += 8;
+        old += 8;
+    }
+
+    /* Add the low and high 32-bit totals together. */
+    movd_r2m( mm0, temp1 );
+    psrlq_i2r( 32, mm0 );
+    movd_r2m( mm0, temp2 );
+    temp1 += temp2;
+
+    emms();
+
+    SPEEDY_END();
+
+    return temp1;
+}
+
+/*
+ * Absolute value of a.  The argument is evaluated twice, so it must not
+ * have side effects.
+ * NOTE(review): this definition sits inside the ARCH_X86 section, but
+ * diff_packed422_block8x8_c, outside that section, uses ABS as well.
+ */
+#define ABS(a) (((a) < 0)?-(a):(a))
+
+/**
+ * MMX version of the 8x8 block difference metrics.
+ *
+ * Reads 8 rows of 16 bytes from each of old (row stride os) and new (row
+ * stride ns); only the luma bytes (even offsets) are used.  The results are
+ * stored in *m:
+ *
+ *   e  sum of |new - old| over rows 0, 2, 4, 6
+ *   o  sum of |new - old| over rows 1, 3, 5, 7
+ *   d  e + o
+ *   p  sum over the 8 columns of |sum over row pairs of (old odd row - old even row)|
+ *   t  sum over the 8 columns of |sum over row pairs of (old odd row - new even row)|
+ *   s  sum over the 8 columns of |sum over row pairs of (new odd row - new even row)|
+ *
+ * The MMX code writes its partial sums to the local out[] buffer; the final
+ * horizontal sums and absolute values are done in C.
+ */
+static void diff_packed422_block8x8_mmx( pulldown_metrics_t *m, uint8_t *old,
+                                  uint8_t *new, int os, int ns )
+{
+    const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
+    short out[ 24 ]; /* Output buffer for the partial metrics from the mmx code. */
+    uint8_t *outdata = (uint8_t *) out;  /* byte view of out[], used as the store target */
+    uint8_t *oldp, *newp;
+    int i;
+
+    SPEEDY_START();
+
+    pxor_r2r( mm4, mm4 );  // 4 even difference sums.
+    pxor_r2r( mm5, mm5 );  // 4 odd difference sums.
+    pxor_r2r( mm7, mm7 );  // zeros
+
+    oldp = old; newp = new;
+    for( i = 4; i; --i ) {
+        // Even difference.
+        movq_m2r( oldp[0], mm0 );
+        movq_m2r( oldp[8], mm2 );
+        pand_m2r( ymask, mm0 );
+        pand_m2r( ymask, mm2 );
+        oldp += os;
+
+        movq_m2r( newp[0], mm1 );
+        movq_m2r( newp[8], mm3 );
+        pand_m2r( ymask, mm1 );
+        pand_m2r( ymask, mm3 );
+        newp += ns;
+
+        // Saturating subtraction in both directions: for every sample one
+        // result is |old - new| and the other is zero.
+        movq_r2r( mm0, mm6 );
+        psubusb_r2r( mm1, mm0 );
+        psubusb_r2r( mm6, mm1 );
+        movq_r2r( mm2, mm6 );
+        psubusb_r2r( mm3, mm2 );
+        psubusb_r2r( mm6, mm3 );
+
+        paddw_r2r( mm0, mm4 );
+        paddw_r2r( mm1, mm4 );
+        paddw_r2r( mm2, mm4 );
+        paddw_r2r( mm3, mm4 );
+
+        // Odd difference.
+        movq_m2r( oldp[0], mm0 );
+        movq_m2r( oldp[8], mm2 );
+        pand_m2r( ymask, mm0 );
+        pand_m2r( ymask, mm2 );
+        oldp += os;
+
+        movq_m2r( newp[0], mm1 );
+        movq_m2r( newp[8], mm3 );
+        pand_m2r( ymask, mm1 );
+        pand_m2r( ymask, mm3 );
+        newp += ns;
+
+        movq_r2r( mm0, mm6 );
+        psubusb_r2r( mm1, mm0 );
+        psubusb_r2r( mm6, mm1 );
+        movq_r2r( mm2, mm6 );
+        psubusb_r2r( mm3, mm2 );
+        psubusb_r2r( mm6, mm3 );
+
+        paddw_r2r( mm0, mm5 );
+        paddw_r2r( mm1, mm5 );
+        paddw_r2r( mm2, mm5 );
+        paddw_r2r( mm3, mm5 );
+    }
+    // out[0..3] = even sums, out[4..7] = odd sums.
+    movq_r2m( mm4, outdata[0] );
+    movq_r2m( mm5, outdata[8] );
+
+    m->e = out[0] + out[1] + out[2] + out[3];
+    m->o = out[4] + out[5] + out[6] + out[7];
+    m->d = m->e + m->o;
+
+    pxor_r2r( mm4, mm4 );  // Past spatial noise.
+    pxor_r2r( mm5, mm5 );  // Temporal noise.
+    pxor_r2r( mm6, mm6 );  // Current spatial noise.
+
+    // First loop to measure first four columns
+    oldp = old; newp = new;
+    for( i = 4; i; --i ) {
+        movq_m2r( oldp[0], mm0 );
+        movq_m2r( oldp[os], mm1 );
+        pand_m2r( ymask, mm0 );
+        pand_m2r( ymask, mm1 );
+        oldp += (os*2);
+
+        movq_m2r( newp[0], mm2 );
+        movq_m2r( newp[ns], mm3 );
+        pand_m2r( ymask, mm2 );
+        pand_m2r( ymask, mm3 );
+        newp += (ns*2);
+
+        // mm4 += old odd - old even, mm5 += old odd - new even,
+        // mm6 += new odd - new even.
+        paddw_r2r( mm1, mm4 );
+        paddw_r2r( mm1, mm5 );
+        paddw_r2r( mm3, mm6 );
+        psubw_r2r( mm0, mm4 );
+        psubw_r2r( mm2, mm5 );
+        psubw_r2r( mm2, mm6 );
+    }
+    // Byte offsets 0, 16 and 32 are out[0], out[8] and out[16].
+    movq_r2m( mm4, outdata[0] );
+    movq_r2m( mm5, outdata[16] );
+    movq_r2m( mm6, outdata[32] );
+
+    pxor_r2r( mm4, mm4 );
+    pxor_r2r( mm5, mm5 );
+    pxor_r2r( mm6, mm6 );
+
+    // Second loop for the last four columns
+    oldp = old; newp = new;
+    for( i = 4; i; --i ) {
+        movq_m2r( oldp[8], mm0 );
+        movq_m2r( oldp[os+8], mm1 );
+        pand_m2r( ymask, mm0 );
+        pand_m2r( ymask, mm1 );
+        oldp += (os*2);
+
+        movq_m2r( newp[8], mm2 );
+        movq_m2r( newp[ns+8], mm3 );
+        pand_m2r( ymask, mm2 );
+        pand_m2r( ymask, mm3 );
+        newp += (ns*2);
+
+        paddw_r2r( mm1, mm4 );
+        paddw_r2r( mm1, mm5 );
+        paddw_r2r( mm3, mm6 );
+        psubw_r2r( mm0, mm4 );
+        psubw_r2r( mm2, mm5 );
+        psubw_r2r( mm2, mm6 );
+    }
+    // Byte offsets 8, 24 and 40 are out[4], out[12] and out[20].
+    movq_r2m( mm4, outdata[8] );
+    movq_r2m( mm5, outdata[24] );
+    movq_r2m( mm6, outdata[40] );
+
+    // out[0..7] hold the p sums, out[8..15] the t sums, out[16..23] the s sums.
+    m->p = m->t = m->s = 0;
+    for (i=0; i<8; i++) {
+        // FIXME: move abs() into the mmx code!
+        m->p += ABS(out[i]);
+        m->t += ABS(out[8+i]);
+        m->s += ABS(out[16+i]);
+    }
+
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+static void diff_packed422_block8x8_c( pulldown_metrics_t *m, uint8_t *old,
+                                uint8_t *new, int os, int ns )
+{
+    int x, y, e=0, o=0, s=0, p=0, t=0;
+    uint8_t *oldp, *newp;
+
+    SPEEDY_START();
+    m->s = m->p = m->t = 0;
+    for (x = 8; x; x--) {
+        oldp = old; old += 2;
+        newp = new; new += 2;
+        s = p = t = 0;
+        for (y = 4; y; y--) {
+            e += ABS(newp[0] - oldp[0]);
+            o += ABS(newp[ns] - oldp[os]);
+            s += newp[ns]-newp[0];
+            p += oldp[os]-oldp[0];
+            t += oldp[os]-newp[0];
+            oldp += os<<1;
+            newp += ns<<1;
+        }
+        m->s += ABS(s);
+        m->p += ABS(p);
+        m->t += ABS(t);
+    }
+    m->e = e;
+    m->o = o;
+    m->d = e+o;
+    SPEEDY_END();
+}
+
+/**
+ * Converts a scanline from 3 bytes per pixel to packed 4:2:2.
+ *
+ * Two input pixels (6 bytes) become 4 output bytes: the first byte of each
+ * pixel is kept, the second and third byte are taken from the first pixel
+ * only, and those of the second pixel are dropped.  Processes width / 2
+ * pixel pairs.
+ */
+static void packed444_to_packed422_scanline_c( uint8_t *output, uint8_t *input, int width )
+{
+    int pairs;
+
+    SPEEDY_START();
+    for( pairs = width / 2; pairs; --pairs, output += 4, input += 6 ) {
+        output[ 0 ] = input[ 0 ];   /* first pixel, byte 0  */
+        output[ 1 ] = input[ 1 ];   /* first pixel, byte 1  */
+        output[ 2 ] = input[ 3 ];   /* second pixel, byte 0 */
+        output[ 3 ] = input[ 2 ];   /* first pixel, byte 2  */
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Converts a packed 4:2:2 scanline to 3 bytes per pixel by replication.
+ *
+ * Each group of 4 input bytes yields two output pixels (6 bytes); input
+ * bytes 1 and 3 are copied into both output pixels.  Processes width / 2
+ * pixel pairs.
+ */
+static void packed422_to_packed444_scanline_c( uint8_t *output, uint8_t *input, int width )
+{
+    int pairs;
+
+    SPEEDY_START();
+    for( pairs = width / 2; pairs; --pairs, output += 6, input += 4 ) {
+        /* First output pixel. */
+        output[ 0 ] = input[ 0 ];
+        output[ 1 ] = input[ 1 ];
+        output[ 2 ] = input[ 3 ];
+        /* Second output pixel, sharing input bytes 1 and 3. */
+        output[ 3 ] = input[ 2 ];
+        output[ 4 ] = input[ 1 ];
+        output[ 5 ] = input[ 3 ];
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Weighted sum of the 12 chroma samples around c for the kernel
+ *
+ * [-1 3 -6 12 -24 80 80 -24 12 -6 3 -1]
+ *
+ * c points at the chroma sample just left of the interpolated position;
+ * neighbouring samples of the same chroma channel are 4 bytes apart.
+ * The weights add up to 128.
+ */
+static inline int rec601_chroma_tap12( const uint8_t *c )
+{
+    return (  (80*(c[   0 ] + c[  4 ]))
+            - (24*(c[  -4 ] + c[  8 ]))
+            + (12*(c[  -8 ] + c[ 12 ]))
+            - ( 6*(c[ -12 ] + c[ 16 ]))
+            + ( 3*(c[ -16 ] + c[ 20 ]))
+            - (   (c[ -20 ] + c[ 24 ])));
+}
+
+/**
+ * Converts a packed 4:2:2 scanline ([Y'][Cb][Y'][Cr]) to 3 bytes per pixel,
+ * interpolating the chroma of every second output pixel.
+ *
+ * The first pixel of each pair copies its luma and both chroma bytes.  The
+ * chroma of the second pixel is
+ *  - the 12-tap kernel above (rounded, divided by 128 and clipped) for
+ *    pairs far enough from both ends of the line,
+ *  - the rounded average of this pair's and the next pair's chroma for the
+ *    remaining pairs except the last,
+ *  - a copy of this pair's chroma for the last pair.
+ */
+void packed422_to_packed444_rec601_scanline( uint8_t *dest, uint8_t *src, int width )
+{
+    const int pairs = width / 2;
+    int i;
+
+    SPEEDY_START();
+    /* Process two input pixels at a time.  Input is [Y'][Cb][Y'][Cr]. */
+    for( i = 0; i < pairs; i++ ) {
+        const uint8_t *in = src + (i*4);
+        uint8_t *out = dest + (i*6);
+
+        out[ 0 ] = in[ 0 ];
+        out[ 1 ] = in[ 1 ];
+        out[ 2 ] = in[ 3 ];
+
+        out[ 3 ] = in[ 2 ];
+        if( i > (5*2) && i < (pairs - (6*2)) ) {
+            out[ 4 ] = clip255( (rec601_chroma_tap12( in + 1 ) + 64) >> 7 );
+            out[ 5 ] = clip255( (rec601_chroma_tap12( in + 3 ) + 64) >> 7 );
+        } else if( i < (pairs - 1) ) {
+            out[ 4 ] = (in[ 1 ] + in[ 5 ] + 1) >> 1;
+            out[ 5 ] = (in[ 3 ] + in[ 7 ] + 1) >> 1;
+        } else {
+            out[ 4 ] = in[ 1 ];
+            out[ 5 ] = in[ 3 ];
+        }
+    }
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version: sets every chroma byte (odd offsets) of the scanline to 128
+ * in place, leaving the luma bytes untouched.
+ *
+ * The MMX loop handles 4 pixels (8 bytes) per iteration while more than 4
+ * pixels remain; what is left is finished by the scalar loop.
+ */
+static void kill_chroma_packed422_inplace_scanline_mmx( uint8_t *data, int width )
+{
+    const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
+    const mmx_t nullchroma = { 0x8000800080008000ULL };
+
+    SPEEDY_START();
+
+    movq_m2r( ymask, mm7 );
+    movq_m2r( nullchroma, mm6 );
+    for(; width > 4; width -= 4 ) {
+        movq_m2r( *data, mm0 );
+        pand_r2r( mm7, mm0 );     /* clear the chroma bytes       */
+        paddb_r2r( mm6, mm0 );    /* 0 + 0x80 in each chroma byte */
+        movq_r2m( mm0, *data );
+        data += 8;
+    }
+    emms();
+
+    /* Remaining pixels, one at a time. */
+    while( width-- ) {
+        data[ 1 ] = 128;
+        data += 2;
+    }
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Plain C version: overwrites the second byte of each of the width 2-byte
+ * pixels with 128, in place.
+ */
+static void kill_chroma_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    uint8_t *chroma = data + 1;
+
+    SPEEDY_START();
+    for( ; width; --width ) {
+        *chroma = 128;
+        chroma += 2;
+    }
+    SPEEDY_END();
+}
+
+/*
+// this duplicates alternate lines in alternate frames to highlight or mute
+// the effects of chroma crawl. it is not a solution or proper filter. it's
+// only for testing.
+static void testing_packed422_inplace_scanline_c( uint8_t *data, int width, int scanline )
+{
+    volatile static int topbottom = 0;
+    static uint8_t scanbuffer[2048];
+
+    SPEEDY_START();
+    if( scanline <= 1 ) {
+        topbottom = scanline;
+        memcpy(scanbuffer, data, width*2);
+    }
+    if ( scanline < 10 ) {
+        printf("scanline: %d %d\n", scanline, topbottom);
+    }
+    if ( ((scanline-topbottom)/2)%2 && scanline > 1 ) {
+        memcpy(data, scanbuffer, width*2);
+    } else {
+        memcpy(scanbuffer, data, width*2);
+    }
+    SPEEDY_END();
+}
+*/
+
+/**
+ * Mirrors a packed 4:2:2 scanline of width pixels in place.
+ *
+ * The 2-byte pixel at byte offset x is swapped with the one at byte offset
+ * 2*width - x, i.e. pixel i is exchanged with pixel width - i.  For an even
+ * width both offsets are equal modulo 4, so every byte keeps its position
+ * inside its 4-byte group.
+ *
+ * Pixel 0 has no partner inside the line (it would be the pixel at offset
+ * 2*width, just past the end), so the loop starts at the second pixel.
+ * Pixel 0 and the centre pixel are left where they are.
+ */
+static void mirror_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    int x, tmp1, tmp2;
+    int width2 = width*2;
+
+    SPEEDY_START();
+    /* Start at x = 2: for x = 0 the partner data[ width2 ] is out of bounds. */
+    for( x = 2; x < width; x += 2 ) {
+        tmp1 = data[ x   ];
+        tmp2 = data[ x+1 ];
+        data[ x   ] = data[ width2 - x     ];
+        data[ x+1 ] = data[ width2 - x + 1 ];
+        data[ width2 - x     ] = tmp1;
+        data[ width2 - x + 1 ] = tmp2;
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Replaces the second half of the scanline with a mirror image of the first
+ * half, in place.  Byte offset width is the centre: the 2-byte pixel at
+ * centre + k is overwritten with the one at centre - k for
+ * k = 0, 2, 4, ... below width.
+ */
+static void halfmirror_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    uint8_t *centre = data + width;
+    int off;
+
+    SPEEDY_START();
+    for( off = 0; off < width; off += 2 ) {
+        centre[ off     ] = centre[ -off    ];
+        centre[ off + 1 ] = centre[ 1 - off ];
+    }
+    SPEEDY_END();
+}
+
+/**
+ * In-place [1 2 1]/4 low-pass filter over the luma bytes (even offsets) of
+ * a packed 4:2:2 scanline of width pixels.
+ *
+ * The filter is built from two running sums: reading luma k produces
+ * luma[k] + 2*luma[k-1] + luma[k-2], which is divided by 4 and written to
+ * pixel k-1.  The history starts at zero, so the first two outputs are
+ * computed with zeros in place of the missing samples.  The last pixel is
+ * read but never written.
+ *
+ * A width of 1 or less leaves the scanline untouched.
+ */
+static void filter_luma_121_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    int r1 = 0;
+    int r2 = 0;
+
+    SPEEDY_START();
+    data += 2;
+    width -= 1;
+    /* "> 0" so that a width below 1 cannot turn into a runaway loop. */
+    while( width-- > 0 ) {
+        int s1, s2;
+        s1 = *data + r1; r1 = *data;
+        s2 = s1    + r2; r2 = s1;
+        *(data - 2) = s2 >> 2;
+        data += 2;
+    }
+    SPEEDY_END();
+}
+
+/**
+ * In-place [1 4 6 4 1]/16 low-pass filter over the luma bytes (even
+ * offsets) of a packed 4:2:2 scanline of width pixels.
+ *
+ * Four cascaded running sums turn the luma read at pixel k into
+ * luma[k] + 4*luma[k-1] + 6*luma[k-2] + 4*luma[k-3] + luma[k-4], which is
+ * divided by 16 and written to pixel k-2.  Reading starts at pixel 2 with a
+ * zero history and runs for width - 4 samples, so the first outputs are
+ * computed with zeros in place of the missing samples and the last four
+ * pixels are never written.
+ *
+ * A width of 4 or less leaves the scanline untouched.
+ */
+static void filter_luma_14641_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    int r1 = 0;
+    int r2 = 0;
+    int r3 = 0;
+    int r4 = 0;
+
+    SPEEDY_START();
+    width -= 4;
+    data += 4;
+    /* "> 0" so that a width below 4 cannot turn into a runaway loop. */
+    while( width-- > 0 ) {
+        int s1, s2, s3, s4;
+        s1 = *data + r1; r1 = *data;
+        s2 = s1    + r2; r2 = s1;
+        s3 = s2    + r3; r3 = s2;
+        s4 = s3    + r4; r4 = s3;
+        *(data - 4) = s4 >> 4;
+        data += 2;
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Plain C version: writes the truncated byte-wise average of the top and
+ * bot scanlines to output, for width pixels (2 bytes each).
+ */
+static void interpolate_packed422_scanline_c( uint8_t *output, uint8_t *top,
+                                       uint8_t *bot, int width )
+{
+    int remaining = width*2;
+
+    SPEEDY_START();
+
+    while( remaining ) {
+        const int sum = *top + *bot;
+
+        *output = sum >> 1;
+        ++output;
+        ++top;
+        ++bot;
+        --remaining;
+    }
+
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version: averages the top and bot scanlines into output for width
+ * pixels (2 bytes each).
+ *
+ * Each byte is halved before the addition (the mask stops the low bit of
+ * the upper byte of a word from being shifted into the lower byte), so the
+ * result is (top >> 1) + (bot >> 1) per byte.
+ *
+ * The line is processed in three stages: blocks of 16 pixels, then blocks
+ * of 4 pixels, then the remaining 0..3 pixels in plain C.
+ */
+static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
+                                         uint8_t *bot, int width )
+{
+    const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */
+    int i;
+
+    SPEEDY_START();
+
+    /* 16 pixels (32 bytes) per iteration. */
+    for( i = width/16; i; --i ) {
+        movq_m2r( *bot, mm0 );
+        movq_m2r( *top, mm1 );
+        movq_m2r( *(bot + 8), mm2 );
+        movq_m2r( *(top + 8), mm3 );
+        movq_m2r( *(bot + 16), mm4 );
+        movq_m2r( *(top + 16), mm5 );
+        movq_m2r( *(bot + 24), mm6 );
+        movq_m2r( *(top + 24), mm7 );
+        pand_m2r( shiftmask, mm0 );
+        pand_m2r( shiftmask, mm1 );
+        pand_m2r( shiftmask, mm2 );
+        pand_m2r( shiftmask, mm3 );
+        pand_m2r( shiftmask, mm4 );
+        pand_m2r( shiftmask, mm5 );
+        pand_m2r( shiftmask, mm6 );
+        pand_m2r( shiftmask, mm7 );
+        psrlw_i2r( 1, mm0 );
+        psrlw_i2r( 1, mm1 );
+        psrlw_i2r( 1, mm2 );
+        psrlw_i2r( 1, mm3 );
+        psrlw_i2r( 1, mm4 );
+        psrlw_i2r( 1, mm5 );
+        psrlw_i2r( 1, mm6 );
+        psrlw_i2r( 1, mm7 );
+        paddb_r2r( mm1, mm0 );
+        paddb_r2r( mm3, mm2 );
+        paddb_r2r( mm5, mm4 );
+        paddb_r2r( mm7, mm6 );
+        movq_r2m( mm0, *output );
+        movq_r2m( mm2, *(output + 8) );
+        movq_r2m( mm4, *(output + 16) );
+        movq_r2m( mm6, *(output + 24) );
+        output += 32;
+        top += 32;
+        bot += 32;
+    }
+    width = (width & 0xf);
+
+    /* 4 pixels (8 bytes) per iteration. */
+    for( i = width/4; i; --i ) {
+        movq_m2r( *bot, mm0 );
+        movq_m2r( *top, mm1 );
+        pand_m2r( shiftmask, mm0 );
+        pand_m2r( shiftmask, mm1 );
+        psrlw_i2r( 1, mm0 );
+        psrlw_i2r( 1, mm1 );
+        paddb_r2r( mm1, mm0 );
+        movq_r2m( mm0, *output );
+        output += 8;
+        top += 8;
+        bot += 8;
+    }
+    /* The loop above consumed all complete groups of 4; 0..3 pixels remain. */
+    width = width & 0x3;
+
+    /* Handle last few pixels. */
+    for( i = width * 2; i; --i ) {
+        *output++ = ((*top++) + (*bot++)) >> 1;
+    }
+
+    emms();
+
+    SPEEDY_END();
+}
+
+/**
+ * MMXEXT version: averages the top and bot scanlines into output for width
+ * pixels (2 bytes each), using pavgb for the byte averages and movntq for
+ * the stores.
+ *
+ * The line is processed in three stages: blocks of 16 pixels, then blocks
+ * of 4 pixels, then the remaining 0..3 pixels in plain C (which truncates
+ * the average).
+ */
+static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top,
+                                            uint8_t *bot, int width )
+{
+    int i;
+
+    SPEEDY_START();
+
+    /* 16 pixels (32 bytes) per iteration. */
+    for( i = width/16; i; --i ) {
+        movq_m2r( *bot, mm0 );
+        movq_m2r( *top, mm1 );
+        movq_m2r( *(bot + 8), mm2 );
+        movq_m2r( *(top + 8), mm3 );
+        movq_m2r( *(bot + 16), mm4 );
+        movq_m2r( *(top + 16), mm5 );
+        movq_m2r( *(bot + 24), mm6 );
+        movq_m2r( *(top + 24), mm7 );
+        pavgb_r2r( mm1, mm0 );
+        pavgb_r2r( mm3, mm2 );
+        pavgb_r2r( mm5, mm4 );
+        pavgb_r2r( mm7, mm6 );
+        movntq_r2m( mm0, *output );
+        movntq_r2m( mm2, *(output + 8) );
+        movntq_r2m( mm4, *(output + 16) );
+        movntq_r2m( mm6, *(output + 24) );
+        output += 32;
+        top += 32;
+        bot += 32;
+    }
+    width = (width & 0xf);
+
+    /* 4 pixels (8 bytes) per iteration. */
+    for( i = width/4; i; --i ) {
+        movq_m2r( *bot, mm0 );
+        movq_m2r( *top, mm1 );
+        pavgb_r2r( mm1, mm0 );
+        movntq_r2m( mm0, *output );
+        output += 8;
+        top += 8;
+        bot += 8;
+    }
+    /* The loop above consumed all complete groups of 4; 0..3 pixels remain. */
+    width = width & 0x3;
+
+    /* Handle last few pixels. */
+    for( i = width * 2; i; --i ) {
+        *output++ = ((*top++) + (*bot++)) >> 1;
+    }
+
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Plain C version: fills a packed 4:2:2 scanline of width pixels with one
+ * colour, writing the byte sequence [y][cb][y][cr] for every pixel pair.
+ *
+ * The bytes are stored one at a time, so the result does not depend on the
+ * byte order of the host and output needs no particular alignment.  For an
+ * odd width the last pixel receives [y][cb], as in the MMX variants.
+ */
+static void blit_colour_packed422_scanline_c( uint8_t *output, int width, int y, int cb, int cr )
+{
+    int pairs;
+
+    SPEEDY_START();
+
+    for( pairs = width / 2; pairs > 0; --pairs ) {
+        output[ 0 ] = y;
+        output[ 1 ] = cb;
+        output[ 2 ] = y;
+        output[ 3 ] = cr;
+        output += 4;
+    }
+
+    /* Odd trailing pixel. */
+    if( width > 0 && (width & 1) ) {
+        output[ 0 ] = y;
+        output[ 1 ] = cb;
+    }
+
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version: fills a packed 4:2:2 scanline of width pixels with one
+ * colour ([y][cb][y][cr] per pixel pair).
+ *
+ * Stages: blocks of 16 pixels, blocks of 4 pixels, then at most one pixel
+ * pair and at most one single pixel ([y][cb]) in plain C.
+ */
+static void blit_colour_packed422_scanline_mmx( uint8_t *output, int width, int y, int cb, int cr )
+{
+    /* Shift as unsigned: cr << 24 on an int overflows for cr >= 128. */
+    uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) y << 16) | ((uint32_t) cb << 8) | (uint32_t) y;
+    int i;
+
+    SPEEDY_START();
+
+    /* mm2 = the 32-bit colour in both halves. */
+    movd_m2r( colour, mm1 );
+    movd_m2r( colour, mm2 );
+    psllq_i2r( 32, mm1 );
+    por_r2r( mm1, mm2 );
+
+    /* 16 pixels (32 bytes) per iteration. */
+    for( i = width / 16; i; --i ) {
+        movq_r2m( mm2, *output );
+        movq_r2m( mm2, *(output + 8) );
+        movq_r2m( mm2, *(output + 16) );
+        movq_r2m( mm2, *(output + 24) );
+        output += 32;
+    }
+    width = (width & 0xf);
+
+    /* 4 pixels (8 bytes) per iteration. */
+    for( i = width / 4; i; --i ) {
+        movq_r2m( mm2, *output );
+        output += 8;
+    }
+    /* The loop above consumed all complete groups of 4; 0..3 pixels remain. */
+    width = (width & 0x3);
+
+    for( i = width / 2; i; --i ) {
+        *((uint32_t *) output) = colour;
+        output += 4;
+    }
+
+    if( width & 1 ) {
+        *output = y;
+        *(output + 1) = cb;
+    }
+
+    emms();
+
+    SPEEDY_END();
+}
+
+/**
+ * MMXEXT version: fills a packed 4:2:2 scanline of width pixels with one
+ * colour ([y][cb][y][cr] per pixel pair), using movntq for the 8-byte
+ * stores.
+ *
+ * Stages: blocks of 16 pixels, blocks of 4 pixels, then at most one pixel
+ * pair and at most one single pixel ([y][cb]) in plain C.
+ */
+static void blit_colour_packed422_scanline_mmxext( uint8_t *output, int width, int y, int cb, int cr )
+{
+    /* Shift as unsigned: cr << 24 on an int overflows for cr >= 128. */
+    uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) y << 16) | ((uint32_t) cb << 8) | (uint32_t) y;
+    int i;
+
+    SPEEDY_START();
+
+    /* mm2 = the 32-bit colour in both halves. */
+    movd_m2r( colour, mm1 );
+    movd_m2r( colour, mm2 );
+    psllq_i2r( 32, mm1 );
+    por_r2r( mm1, mm2 );
+
+    /* 16 pixels (32 bytes) per iteration. */
+    for( i = width / 16; i; --i ) {
+        movntq_r2m( mm2, *output );
+        movntq_r2m( mm2, *(output + 8) );
+        movntq_r2m( mm2, *(output + 16) );
+        movntq_r2m( mm2, *(output + 24) );
+        output += 32;
+    }
+    width = (width & 0xf);
+
+    /* 4 pixels (8 bytes) per iteration. */
+    for( i = width / 4; i; --i ) {
+        movntq_r2m( mm2, *output );
+        output += 8;
+    }
+    /* The loop above consumed all complete groups of 4; 0..3 pixels remain. */
+    width = (width & 0x3);
+
+    for( i = width / 2; i; --i ) {
+        *((uint32_t *) output) = colour;
+        output += 4;
+    }
+
+    if( width & 1 ) {
+        *output = y;
+        *(output + 1) = cb;
+    }
+
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Plain C version: fills width 4-byte pixels with the byte sequence
+ * [alpha][luma][cb][cr].
+ */
+static void blit_colour_packed4444_scanline_c( uint8_t *output, int width,
+                                        int alpha, int luma, int cb, int cr )
+{
+    int remaining;
+
+    SPEEDY_START();
+
+    for( remaining = width; remaining > 0; --remaining ) {
+        output[ 0 ] = alpha;
+        output[ 1 ] = luma;
+        output[ 2 ] = cb;
+        output[ 3 ] = cr;
+        output += 4;
+    }
+
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version: fills width 4-byte pixels with one colour.  The 32-bit
+ * value holds alpha in its lowest byte, then luma, cb and cr.
+ *
+ * Stages: blocks of 8 pixels, blocks of 2 pixels, then at most one single
+ * pixel written as a 32-bit store.
+ */
+static void blit_colour_packed4444_scanline_mmx( uint8_t *output, int width,
+                                          int alpha, int luma,
+                                          int cb, int cr )
+{
+    /* Shift as unsigned: cr << 24 on an int overflows for cr >= 128. */
+    uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) cb << 16) | ((uint32_t) luma << 8) | (uint32_t) alpha;
+    int i;
+
+    SPEEDY_START();
+
+    /* mm2 = the 32-bit colour in both halves. */
+    movd_m2r( colour, mm1 );
+    movd_m2r( colour, mm2 );
+    psllq_i2r( 32, mm1 );
+    por_r2r( mm1, mm2 );
+
+    /* 8 pixels (32 bytes) per iteration. */
+    for( i = width / 8; i; --i ) {
+        movq_r2m( mm2, *output );
+        movq_r2m( mm2, *(output + 8) );
+        movq_r2m( mm2, *(output + 16) );
+        movq_r2m( mm2, *(output + 24) );
+        output += 32;
+    }
+    width = (width & 0x7);
+
+    /* 2 pixels (8 bytes) per iteration. */
+    for( i = width / 2; i; --i ) {
+        movq_r2m( mm2, *output );
+        output += 8;
+    }
+    width = (width & 0x1);
+
+    if( width ) {
+        *((uint32_t *) output) = colour;
+        output += 4;
+    }
+
+    emms();
+
+    SPEEDY_END();
+}
+
+/**
+ * MMXEXT version: fills width 4-byte pixels with one colour, using movntq
+ * for the 8-byte stores.  The 32-bit value holds alpha in its lowest byte,
+ * then luma, cb and cr.
+ *
+ * Stages: blocks of 8 pixels, blocks of 2 pixels, then at most one single
+ * pixel written as a 32-bit store.
+ *
+ * NOTE(review): unlike the other variants in this file this function is
+ * not static; linkage is left as it was.
+ */
+void blit_colour_packed4444_scanline_mmxext( uint8_t *output, int width,
+                                             int alpha, int luma,
+                                             int cb, int cr )
+{
+    /* Shift as unsigned: cr << 24 on an int overflows for cr >= 128. */
+    uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) cb << 16) | ((uint32_t) luma << 8) | (uint32_t) alpha;
+    int i;
+
+    SPEEDY_START();
+
+    /* mm2 = the 32-bit colour in both halves. */
+    movd_m2r( colour, mm1 );
+    movd_m2r( colour, mm2 );
+    psllq_i2r( 32, mm1 );
+    por_r2r( mm1, mm2 );
+
+    /* 8 pixels (32 bytes) per iteration. */
+    for( i = width / 8; i; --i ) {
+        movntq_r2m( mm2, *output );
+        movntq_r2m( mm2, *(output + 8) );
+        movntq_r2m( mm2, *(output + 16) );
+        movntq_r2m( mm2, *(output + 24) );
+        output += 32;
+    }
+    width = (width & 0x7);
+
+    /* 2 pixels (8 bytes) per iteration. */
+    for( i = width / 2; i; --i ) {
+        movntq_r2m( mm2, *output );
+        output += 8;
+    }
+    width = (width & 0x1);
+
+    if( width ) {
+        *((uint32_t *) output) = colour;
+        output += 4;
+    }
+
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Copies a scanline of width 2-byte pixels from src to dest with
+ * xine_fast_memcpy.
+ */
+static void blit_packed422_scanline_c( uint8_t *dest, const uint8_t *src, int width )
+{
+    const int nbytes = width*2;
+
+    xine_fast_memcpy( dest, src, nbytes );
+}
+
+/**
+ * Copies a scanline of width 2-byte pixels from src to dest.  Contains no
+ * MMX code of its own; it calls xine_fast_memcpy like the _c variant.
+ */
+static void blit_packed422_scanline_mmx( uint8_t *dest, const uint8_t *src, int width )
+{
+    const int nbytes = width*2;
+
+    xine_fast_memcpy( dest, src, nbytes );
+}
+
+/**
+ * Copies a scanline of width 2-byte pixels from src to dest.  Contains no
+ * MMXEXT code of its own; it calls xine_fast_memcpy like the _c variant.
+ */
+static void blit_packed422_scanline_mmxext( uint8_t *dest, const uint8_t *src, int width )
+{
+    const int nbytes = width*2;
+
+    xine_fast_memcpy( dest, src, nbytes );
+}
+
+/**
+ * Plain C version: composites a 4-byte-per-pixel foreground scanline
+ * ([a][y][cb][cr]) onto a packed 4:2:2 scanline, with an extra global
+ * alpha applied to the foreground.
+ *
+ * For each of the width pixels, with af the foreground alpha byte and
+ * a = (af * alpha + 128) >> 8:
+ *  - af == 0 or a == 0: output is not written,
+ *  - a == 0xff: the foreground luma is copied,
+ *  - otherwise: output = input + ((alpha * (F - multiply_alpha( af, input ))
+ *    + 128) >> 8), where F is the foreground value.
+ *
+ * Chroma (output bytes 1 and 3 of the pixel pair) is handled the same way,
+ * but only at even pixel indices and from that pixel's foreground chroma.
+ *
+ * The blend follows
+ *     (1 - alpha)*B + alpha*F  with alpha = af*a
+ *   =  B - af*a*B + af*a*F
+ *   =  B + a*(af*F - af*B)
+ * and the same expression holds for chroma because the 128 offsets cancel:
+ *     128 + ((1 - af*a)*(B - 128) + a*af*(F - 128)) = B - af*a*B + a*af*F
+ */
+static void composite_packed4444_alpha_to_packed422_scanline_c( uint8_t *output, uint8_t *input,
+                                                         uint8_t *foreground, int width, int alpha )
+{
+    int i;
+
+    SPEEDY_START();
+    for( i = 0; i < width; i++, foreground += 4, output += 2, input += 2 ) {
+        const int af = foreground[ 0 ];
+        const int first_of_pair = ( ( i & 1 ) == 0 );
+        int a;
+
+        /* Fully transparent foreground pixel: nothing to do. */
+        if( !af ) {
+            continue;
+        }
+
+        a = ((af * alpha) + 0x80) >> 8;
+        if( !a ) {
+            continue;
+        }
+
+        if( a == 0xff ) {
+            /* Opaque: plain copy of the foreground. */
+            output[ 0 ] = foreground[ 1 ];
+
+            if( first_of_pair ) {
+                output[ 1 ] = foreground[ 2 ];
+                output[ 3 ] = foreground[ 3 ];
+            }
+            continue;
+        }
+
+        /* Partially transparent: blend luma ... */
+        output[ 0 ] = input[ 0 ]
+                    + ((alpha*( foreground[ 1 ]
+                                - multiply_alpha( foreground[ 0 ], input[ 0 ] ) ) + 0x80) >> 8);
+
+        /* ... and, once per pixel pair, both chroma bytes. */
+        if( first_of_pair ) {
+            output[ 1 ] = input[ 1 ] + ((alpha*( foreground[ 2 ]
+                                    - multiply_alpha( foreground[ 0 ], input[ 1 ] ) ) + 0x80) >> 8);
+            output[ 3 ] = input[ 3 ] + ((alpha*( foreground[ 3 ]
+                                    - multiply_alpha( foreground[ 0 ], input[ 3 ] ) ) + 0x80) >> 8);
+        }
+    }
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMXEXT variant of composite_packed4444_alpha_to_packed422_scanline:
+ * composites a packed4444 foreground (bytes per pixel: alpha, luma, cb, cr)
+ * over a packed422 background, scaled by an additional global alpha.
+ *
+ * alpha == 0 is delegated to blit_packed422_scanline() and alpha == 256 to
+ * composite_packed4444_to_packed422_scanline().  Otherwise two pixels
+ * (8 foreground bytes, 4 input/output bytes) are handled per iteration, so
+ * width/2 pairs are processed and an odd trailing pixel is not touched.
+ * Pairs whose two 32-bit foreground words are both zero are skipped without
+ * writing to output.
+ *
+ * Per 16-bit lane, with mul(x,y) = (t + (t >> 8)) >> 8 for t = x*y + 0x80:
+ *   result = B + ((alpha*F + 0x80) >> 8) - ((alpha*mul(af, B) + 0x80) >> 8)
+ * using an unsigned saturating subtract and a saturating pack to bytes.
+ * The chroma of a pair (and the alpha applied to it) comes from the first
+ * foreground pixel only.
+ *
+ * @param output     destination packed422 scanline
+ * @param input      background packed422 scanline
+ * @param foreground packed4444 scanline to composite on top
+ * @param width      width in pixels
+ * @param alpha      global alpha; 0 and 256 take the shortcuts above
+ */
+static void composite_packed4444_alpha_to_packed422_scanline_mmxext( uint8_t *output,
+                                                              uint8_t *input,
+                                                              uint8_t *foreground,
+                                                              int width, int alpha )
+{
+    /* Lane masks: alpha1 keeps words 0, 1 and 3, alpha2 keeps word 2. */
+    const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
+    const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
+    const mmx_t round  = { 0x0080008000800080ULL };
+    int i;
+
+    if( !alpha ) {
+        blit_packed422_scanline( output, input, width );
+        return;
+    }
+
+    if( alpha == 256 ) {
+        composite_packed4444_to_packed422_scanline( output, input, foreground, width );
+        return;
+    }
+
+    SPEEDY_START();
+    READ_PREFETCH_2048( input );
+    READ_PREFETCH_2048( foreground );
+
+    /**
+     * mm2 = [ alpha ][ alpha ][ alpha ][ alpha ].  alpha is a 32-bit int, so
+     * it is loaded with movd; a 64-bit movq would read past the variable.
+     * Only the low word survives the pshufw broadcast either way.
+     */
+    movd_m2r( alpha, mm2 );
+    pshufw_r2r( mm2, mm2, 0 );
+    /* mm7 = 0, used to widen bytes to words. */
+    pxor_r2r( mm7, mm7 );
+
+    for( i = width/2; i; i-- ) {
+        int fg1 = *((uint32_t *) foreground);
+        int fg2 = *(((uint32_t *) foreground)+1);
+
+        if( fg1 || fg2 ) {
+            /* mm1 = [ cr ][ y ][ cb ][ y ] */
+            movd_m2r( *input, mm1 );
+            punpcklbw_r2r( mm7, mm1 );
+
+            /* mm3 = first foreground pixel, mm4 = second, widened to words. */
+            movq_m2r( *foreground, mm3 );
+            movq_r2r( mm3, mm4 );
+            punpcklbw_r2r( mm7, mm3 );
+            punpckhbw_r2r( mm7, mm4 );
+            /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */
+
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */
+            pshufw_r2r( mm3, mm5, 0 );
+            pshufw_r2r( mm4, mm6, 0 );
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ]  == 11001001 == 201 */
+            pshufw_r2r( mm3, mm3, 201 );
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ]  == 00010000 == 16 */
+            pshufw_r2r( mm4, mm4, 16 );
+
+            /* Merge: mm3 = [ cr1 ][ y2 ][ cb1 ][ y1 ], mm5 = [ a1 ][ a2 ][ a1 ][ a1 ]. */
+            pand_m2r( alpha1, mm3 );
+            pand_m2r( alpha2, mm4 );
+            pand_m2r( alpha1, mm5 );
+            pand_m2r( alpha2, mm6 );
+            por_r2r( mm4, mm3 );
+            por_r2r( mm6, mm5 );
+
+            /* now, mm5 is af and mm1 is B.  Need to multiply them. */
+            pmullw_r2r( mm1, mm5 );
+
+            /* Multiply F by the global alpha. */
+            pmullw_r2r( mm2, mm3 );
+            paddw_m2r( round, mm3 );
+            psrlw_i2r( 8, mm3 );
+            /* Result is now B + alpha*F. */
+            paddw_r2r( mm3, mm1 );
+
+            /* Round up appropriately. */
+            paddw_m2r( round, mm5 );
+
+            /* mm6 contains our i>>8; */
+            movq_r2r( mm5, mm6 );
+            psrlw_i2r( 8, mm6 );
+
+            /* Add mm6 back into mm5.  Now our result is in the high bytes. */
+            paddw_r2r( mm6, mm5 );
+
+            /* Shift down. */
+            psrlw_i2r( 8, mm5 );
+
+            /* Multiply af*B by the global alpha. */
+            pmullw_r2r( mm2, mm5 );
+            paddw_m2r( round, mm5 );
+            psrlw_i2r( 8, mm5 );
+
+            /* Saturating subtract so the result cannot wrap below zero. */
+            psubusw_r2r( mm5, mm1 );
+
+            /* mm1 = [ B + alpha*F - alpha*af*B ] */
+            packuswb_r2r( mm1, mm1 );
+            movd_r2m( mm1, *output );
+        }
+
+        foreground += 8;
+        output += 4;
+        input += 4;
+    }
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Portable C compositor: lays a packed4444 foreground (bytes per pixel:
+ * alpha, luma, cb, cr) over a packed422 background.
+ *
+ * For each pixel the foreground alpha af selects one of three cases:
+ *  - af == 0:    nothing is written to output for this pixel.
+ *  - af == 0xff: the foreground replaces the background.
+ *  - otherwise:  output = B + F - af*B.  The foreground colour is added
+ *                unscaled, so it is presumably premultiplied by its alpha
+ *                (TODO confirm against the callers).
+ *
+ * Chroma (output[ 1 ] = cb, output[ 3 ] = cr) is only written on even
+ * pixels, using that pixel's foreground chroma and alpha.
+ *
+ * @param output     destination packed422 scanline
+ * @param input      background packed422 scanline
+ * @param foreground packed4444 scanline to composite on top
+ * @param width      width in pixels
+ */
+static void composite_packed4444_to_packed422_scanline_c( uint8_t *output, uint8_t *input,
+                                                   uint8_t *foreground, int width )
+{
+    int x;
+
+    SPEEDY_START();
+    for( x = 0; x < width; x++, foreground += 4, output += 2, input += 2 ) {
+        const int af = foreground[ 0 ];
+        const int on_chroma_site = ( x & 1 ) == 0;
+
+        if( !af ) {
+            /* Fully transparent foreground: leave this pixel alone. */
+            continue;
+        }
+
+        if( af == 0xff ) {
+            /* Fully opaque foreground: plain copy. */
+            output[ 0 ] = foreground[ 1 ];
+            if( on_chroma_site ) {
+                output[ 1 ] = foreground[ 2 ];
+                output[ 3 ] = foreground[ 3 ];
+            }
+            continue;
+        }
+
+        /**
+         * (1 - af)*B + af*F  ==  B + af*F - af*B
+         * where the af*F term is taken directly from the foreground.
+         */
+        output[ 0 ] = input[ 0 ] + foreground[ 1 ] - multiply_alpha( af, input[ 0 ] );
+        if( on_chroma_site ) {
+            output[ 1 ] = input[ 1 ] + foreground[ 2 ] - multiply_alpha( af, input[ 1 ] );
+            output[ 3 ] = input[ 3 ] + foreground[ 3 ] - multiply_alpha( af, input[ 3 ] );
+        }
+    }
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMXEXT variant of composite_packed4444_to_packed422_scanline: lays a
+ * packed4444 foreground (bytes per pixel: alpha, luma, cb, cr) over a
+ * packed422 background.
+ *
+ * Two pixels (8 foreground bytes, 4 input/output bytes) are handled per
+ * iteration, so width/2 pairs are processed and an odd trailing pixel is
+ * not touched.  For each pair:
+ *  - both alphas are 0xff: the foreground is written out directly;
+ *  - both 32-bit foreground words are zero: nothing is written;
+ *  - otherwise: each lane becomes B + F - mul(af, B), with
+ *    mul(x,y) = (t + (t >> 8)) >> 8 for t = x*y + 0x80, using an unsigned
+ *    saturating subtract and a saturating pack to bytes.
+ * The chroma of a pair (and the alpha applied to it) comes from the first
+ * foreground pixel only.
+ *
+ * @param output     destination packed422 scanline
+ * @param input      background packed422 scanline
+ * @param foreground packed4444 scanline to composite on top
+ * @param width      width in pixels
+ */
+static void composite_packed4444_to_packed422_scanline_mmxext( uint8_t *output, uint8_t *input,
+                                                        uint8_t *foreground, int width )
+{
+    /* Lane masks: alpha1 keeps words 0, 1 and 3, alpha2 keeps word 2. */
+    const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
+    const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
+    const mmx_t round  = { 0x0080008000800080ULL };
+    int i;
+
+    SPEEDY_START();
+    READ_PREFETCH_2048( input );
+    READ_PREFETCH_2048( foreground );
+
+    /* mm7 = 0, used to widen bytes to words. */
+    pxor_r2r( mm7, mm7 );
+    for( i = width/2; i; i-- ) {
+        int fg1 = *((uint32_t *) foreground);
+        int fg2 = *(((uint32_t *) foreground)+1);
+
+        if( (fg1 & 0xff) == 0xff && (fg2 & 0xff) == 0xff ) {
+            /* Both pixels are fully opaque: no blending needed. */
+            movq_m2r( *foreground, mm3 );
+            movq_r2r( mm3, mm4 );
+            punpcklbw_r2r( mm7, mm3 );
+            punpckhbw_r2r( mm7, mm4 );
+            /* mm3 and mm4 hold the two foreground pixels, widened to words. */
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ]  == 11001001 == 201 */
+            pshufw_r2r( mm3, mm3, 201 );
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0  a ][ 1 y ][ 0  a ][ 0 a ]  == 00010000 == 16 */
+            pshufw_r2r( mm4, mm4, 16 );
+            pand_m2r( alpha1, mm3 );
+            pand_m2r( alpha2, mm4 );
+            por_r2r( mm4, mm3 );
+            /* mm3 = [ cr1 ][ y2 ][ cb1 ][ y1 ], the foreground in packed422 order. */
+            packuswb_r2r( mm3, mm3 );
+            movd_r2m( mm3, *output );
+        } else if( fg1 || fg2 ) {
+
+            /* mm1 = [ cr ][ y ][ cb ][ y ] */
+            movd_m2r( *input, mm1 );
+            punpcklbw_r2r( mm7, mm1 );
+
+            movq_m2r( *foreground, mm3 );
+            movq_r2r( mm3, mm4 );
+            punpcklbw_r2r( mm7, mm3 );
+            punpckhbw_r2r( mm7, mm4 );
+            /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */
+
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */
+            pshufw_r2r( mm3, mm5, 0 );
+            pshufw_r2r( mm4, mm6, 0 );
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ]  == 11001001 == 201 */
+            pshufw_r2r( mm3, mm3, 201 );
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ]  -> [ 0  a ][ 1 y ][ 0  a ][ 0 a ]  == 00010000 == 16 */
+            pshufw_r2r( mm4, mm4, 16 );
+
+            /* Merge: mm3 = [ cr1 ][ y2 ][ cb1 ][ y1 ], mm5 = [ a1 ][ a2 ][ a1 ][ a1 ]. */
+            pand_m2r( alpha1, mm3 );
+            pand_m2r( alpha2, mm4 );
+            pand_m2r( alpha1, mm5 );
+            pand_m2r( alpha2, mm6 );
+            por_r2r( mm4, mm3 );
+            por_r2r( mm6, mm5 );
+
+            /* now, mm5 is af and mm1 is B.  Need to multiply them. */
+            pmullw_r2r( mm1, mm5 );
+
+            /* Result is now B + F. */
+            paddw_r2r( mm3, mm1 );
+
+            /* Round up appropriately. */
+            paddw_m2r( round, mm5 );
+
+            /* mm6 contains our i>>8; */
+            movq_r2r( mm5, mm6 );
+            psrlw_i2r( 8, mm6 );
+
+            /* Add mm6 back into mm5.  Now our result is in the high bytes. */
+            paddw_r2r( mm6, mm5 );
+
+            /* Shift down. */
+            psrlw_i2r( 8, mm5 );
+
+            /* Saturating subtract so the result cannot wrap below zero. */
+            psubusw_r2r( mm5, mm1 );
+
+            /* mm1 = [ B + F - af*B ] */
+            packuswb_r2r( mm1, mm1 );
+            movd_r2m( mm1, *output );
+        }
+
+        foreground += 8;
+        output += 4;
+        input += 4;
+    }
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Composites a solid text colour through an 8-bit alpha mask onto a
+ * packed4444 scanline.  input[ 0 ] is treated as the background alpha and
+ * input[ 1..3 ] as luma, cb and cr.
+ *
+ * With mask value a, background B and background alpha Ba:
+ *   D  = (1 - a)*B + a*text
+ *      = B - a*B + a*text
+ *      = B + a*(text - B)
+ *   Da = (1 - a)*Ba + a
+ *      = Ba + a*(0xff - Ba)
+ *
+ * Per pixel:
+ *  - mask == 0xff:          the opaque text colour is written;
+ *  - background alpha == 0: the text colour scaled by the mask is written,
+ *                           with the mask value as alpha;
+ *  - mask != 0:             the blend above is written;
+ *  - otherwise:             output is left untouched.
+ *
+ * Each pixel is stored as one 32-bit word with alpha in the low 8 bits and
+ * cr in the high 8 bits (NOTE(review): this matches the byte order read
+ * from input only on little-endian hosts).
+ *
+ * @param output   destination packed4444 scanline
+ * @param input    background packed4444 scanline
+ * @param mask     one alpha byte per pixel
+ * @param width    width in pixels
+ * @param textluma luma of the text colour
+ * @param textcb   cb of the text colour
+ * @param textcr   cr of the text colour
+ */
+static void composite_alphamask_to_packed4444_scanline_c( uint8_t *output,
+                                                   uint8_t *input,
+                                                   uint8_t *mask,
+                                                   int width,
+                                                   int textluma, int textcb,
+                                                   int textcr )
+{
+    /**
+     * All channel shifts are done in uint32_t: shifting an int holding
+     * 128..255 left by 24 overflows a signed int, which is undefined.
+     */
+    const uint32_t opaque = ((uint32_t) textcr << 24) | ((uint32_t) textcb << 16)
+                          | ((uint32_t) textluma << 8) | 0xff;
+    int i;
+
+    SPEEDY_START();
+
+    for( i = 0; i < width; i++ ) {
+        int a = *mask;
+
+        if( a == 0xff ) {
+            *((uint32_t *) output) = opaque;
+        } else if( input[ 0 ] == 0x00 ) {
+            /* Transparent background: the result is just the scaled text colour. */
+            *((uint32_t *) output) = ((uint32_t) multiply_alpha( a, textcr ) << 24)
+                                   | ((uint32_t) multiply_alpha( a, textcb ) << 16)
+                                   | ((uint32_t) multiply_alpha( a, textluma ) << 8)
+                                   | (uint32_t) a;
+        } else if( a ) {
+            *((uint32_t *) output) = ((uint32_t) (input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24)
+                                   | ((uint32_t) (input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16)
+                                   | ((uint32_t) (input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8)
+                                   |  (uint32_t) (input[ 0 ] + multiply_alpha( a, 0xff - input[ 0 ] ));
+        }
+        mask++;
+        output += 4;
+        input += 4;
+    }
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMXEXT variant of composite_alphamask_to_packed4444_scanline: composites
+ * a solid text colour through an 8-bit alpha mask onto a packed4444
+ * scanline (bytes per pixel: alpha, luma, cb, cr).
+ *
+ * Per pixel:
+ *  - mask == 0xff:          the opaque text colour is written;
+ *  - background alpha == 0: the text colour scaled by the mask is written,
+ *                           with the mask value as alpha;
+ *  - mask != 0:             input + mask*(text - input) is written, with
+ *                           0xff standing in for the text alpha;
+ *  - otherwise:             output is left untouched.
+ *
+ * Registers held across the whole loop:
+ *   mm0 = [ cr ][ cb ][ y ][ 0xff ]   text colour with full alpha
+ *   mm1 = [ cr ][ cb ][ y ][ 0 ]      text colour with zero alpha
+ *   mm6 = rounding constant (0x80 per word)
+ *   mm7 = 0
+ * mm2..mm5 are per-pixel scratch.
+ *
+ * @param output   destination packed4444 scanline
+ * @param input    background packed4444 scanline
+ * @param mask     one alpha byte per pixel
+ * @param width    width in pixels
+ * @param textluma luma of the text colour
+ * @param textcb   cb of the text colour
+ * @param textcr   cr of the text colour
+ */
+static void composite_alphamask_to_packed4444_scanline_mmxext( uint8_t *output,
+                                                        uint8_t *input,
+                                                        uint8_t *mask,
+                                                        int width,
+                                                        int textluma, int textcb,
+                                                        int textcr )
+{
+    /* Shift in uint32_t: a channel of 128..255 shifted by 24 overflows an int. */
+    uint32_t opaque = ((uint32_t) textcr << 24) | ((uint32_t) textcb << 16)
+                    | ((uint32_t) textluma << 8) | 0xff;
+    const mmx_t round = { 0x0080008000800080ULL };
+    const mmx_t fullalpha = { 0x00000000000000ffULL };
+    mmx_t colour;
+
+    SPEEDY_START();
+
+    colour.w[ 0 ] = 0x00;
+    colour.w[ 1 ] = textluma;
+    colour.w[ 2 ] = textcb;
+    colour.w[ 3 ] = textcr;
+
+    /* mm1 = [ cr ][ cb ][ y ][ 0 ] */
+    movq_m2r( colour, mm1 );
+    movq_r2r( mm1, mm0 );
+
+    /* mm0 = [ cr ][ cb ][ y ][ 0xff ] */
+    paddw_m2r( fullalpha, mm0 );
+
+    /* mm7 = 0 */
+    pxor_r2r( mm7, mm7 );
+
+    /* mm6 = round */
+    movq_m2r( round, mm6 );
+
+    while( width-- ) {
+        int a = *mask;
+
+        if( a == 0xff ) {
+            *((uint32_t *) output) = opaque;
+        } else if( (input[ 0 ] == 0x00) ) {
+            /* We just need to multiply our colour by the alpha value. */
+
+            /* mm2 = [ a ][ a ][ a ][ a ], mm3 = [ 0 ][ 0 ][ 0 ][ a ] */
+            movd_m2r( a, mm2 );
+            movq_r2r( mm2, mm3 );
+            pshufw_r2r( mm2, mm2, 0 );
+
+            /* mm5 = [ cr ][ cb ][ y ][ 0 ] */
+            movq_r2r( mm1, mm5 );
+
+            /**
+             * Multiply by alpha.  mm4 is the scratch register here, not
+             * mm6: mm6 must keep the rounding constant for the blend
+             * branch of later pixels.  mm4 is reloaded from input before
+             * every use in that branch, so clobbering it is safe.
+             */
+            pmullw_r2r( mm2, mm5 );
+            paddw_m2r( round, mm5 );
+            movq_r2r( mm5, mm4 );
+            psrlw_i2r( 8, mm4 );
+            paddw_r2r( mm4, mm5 );
+            psrlw_i2r( 8, mm5 );
+
+            /* Set alpha to a. */
+            por_r2r( mm3, mm5 );
+
+            /* Pack and write our result. */
+            packuswb_r2r( mm5, mm5 );
+            movd_r2m( mm5, *output );
+        } else if( a ) {
+            /* mm2 = [ a ][ a ][ a ][ a ] */
+            movd_m2r( a, mm2 );
+            pshufw_r2r( mm2, mm2, 0 );
+
+            /* mm3 = [ cr ][ cb ][ y ][ 0xff ] */
+            movq_r2r( mm0, mm3 );
+
+            /* mm4 = [ i_cr ][ i_cb ][ i_y ][ i_a ] */
+            movd_m2r( *input, mm4 );
+            punpcklbw_r2r( mm7, mm4 );
+
+            /* Subtract input and colour. */
+            psubw_r2r( mm4, mm3 );  /* mm3 = mm3 - mm4 */
+
+            /* Multiply alpha, rounding with the constant kept in mm6. */
+            pmullw_r2r( mm2, mm3 );
+            paddw_r2r( mm6, mm3 );
+            movq_r2r( mm3, mm2 );
+            psrlw_i2r( 8, mm3 );
+            paddw_r2r( mm2, mm3 );
+            psrlw_i2r( 8, mm3 );
+
+            /* Add back in the input (bytewise, so negative deltas wrap). */
+            paddb_r2r( mm3, mm4 );
+
+            /* Write result. */
+            packuswb_r2r( mm4, mm4 );
+            movd_r2m( mm4, *output );
+        }
+        mask++;
+        output += 4;
+        input += 4;
+    }
+    sfence();
+    emms();
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Composites a solid text colour through an 8-bit alpha mask, scaled by a
+ * global alpha, onto a packed4444 scanline.  input[ 0 ] is treated as the
+ * background alpha and input[ 1..3 ] as luma, cb and cr.
+ *
+ * Pixels whose mask byte is zero are skipped without writing to output.
+ * Otherwise the effective alpha is a = (mask*alpha + 0x80) >> 8 and:
+ *  - a == 0xff:             the opaque text colour is written;
+ *  - background alpha == 0: the text colour scaled by a is written, with a
+ *                           as alpha;
+ *  - a != 0:                colour = B + a*(text - B) and
+ *                           alpha  = a + (0xff - a)*Ba;
+ *  - otherwise:             output is left untouched.
+ *
+ * Each pixel is stored as one 32-bit word with alpha in the low 8 bits and
+ * cr in the high 8 bits (NOTE(review): this matches the byte order read
+ * from input only on little-endian hosts).
+ *
+ * @param output   destination packed4444 scanline
+ * @param input    background packed4444 scanline
+ * @param mask     one alpha byte per pixel
+ * @param width    width in pixels
+ * @param textluma luma of the text colour
+ * @param textcb   cb of the text colour
+ * @param textcr   cr of the text colour
+ * @param alpha    global alpha applied on top of the mask
+ */
+static void composite_alphamask_alpha_to_packed4444_scanline_c( uint8_t *output,
+                                                       uint8_t *input,
+                                                       uint8_t *mask, int width,
+                                                       int textluma, int textcb,
+                                                       int textcr, int alpha )
+{
+    /**
+     * All channel shifts are done in uint32_t: shifting an int holding
+     * 128..255 left by 24 overflows a signed int, which is undefined.
+     */
+    const uint32_t opaque = ((uint32_t) textcr << 24) | ((uint32_t) textcb << 16)
+                          | ((uint32_t) textluma << 8) | 0xff;
+    int i;
+
+    SPEEDY_START();
+
+    for( i = 0; i < width; i++ ) {
+        int af = *mask;
+
+        if( af ) {
+            int a = ((af * alpha) + 0x80) >> 8;
+
+            if( a == 0xff ) {
+                *((uint32_t *) output) = opaque;
+            } else if( input[ 0 ] == 0x00 ) {
+                /* Transparent background: the result is just the scaled text colour. */
+                *((uint32_t *) output) = ((uint32_t) multiply_alpha( a, textcr ) << 24)
+                                       | ((uint32_t) multiply_alpha( a, textcb ) << 16)
+                                       | ((uint32_t) multiply_alpha( a, textluma ) << 8)
+                                       | (uint32_t) a;
+            } else if( a ) {
+                *((uint32_t *) output) = ((uint32_t) (input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24)
+                                       | ((uint32_t) (input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16)
+                                       | ((uint32_t) (input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8)
+                                       |  (uint32_t) (a + multiply_alpha( 0xff - a, input[ 0 ] ));
+            }
+        }
+        mask++;
+        output += 4;
+        input += 4;
+    }
+
+    SPEEDY_END();
+}
+
+/**
+ * Premultiplies a packed4444 scanline by its own alpha: input[ 0 ] is the
+ * alpha, and input[ 1..3 ] are each scaled by it with multiply_alpha().
+ * The alpha itself is passed through unchanged.
+ *
+ * Each pixel is stored as one 32-bit word with alpha in the low 8 bits and
+ * the scaled input[ 3 ] in the high 8 bits (NOTE(review): this matches the
+ * byte order read from input only on little-endian hosts).
+ *
+ * @param output destination packed4444 scanline
+ * @param input  source packed4444 scanline
+ * @param width  width in pixels
+ */
+static void premultiply_packed4444_scanline_c( uint8_t *output, uint8_t *input, int width )
+{
+    SPEEDY_START();
+
+    while( width-- ) {
+        unsigned int cur_a = input[ 0 ];
+
+        /**
+         * Shift in uint32_t: a scaled channel of 128..255 shifted left by
+         * 24 overflows a signed int, which is undefined.
+         */
+        *((uint32_t *) output) = ((uint32_t) multiply_alpha( cur_a, input[ 3 ] ) << 24)
+                               | ((uint32_t) multiply_alpha( cur_a, input[ 2 ] ) << 16)
+                               | ((uint32_t) multiply_alpha( cur_a, input[ 1 ] ) << 8)
+                               | cur_a;
+
+        output += 4;
+        input += 4;
+    }
+
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMXEXT variant of premultiply_packed4444_scanline: scales the three
+ * colour bytes of every packed4444 pixel by the pixel's own alpha (byte 0)
+ * and passes the alpha through unchanged.  One pixel is handled per
+ * iteration.
+ *
+ * The scaling per 16-bit lane is (t + (t >> 8)) >> 8 with t = a*c + 0x80.
+ *
+ * @param output destination packed4444 scanline
+ * @param input  source packed4444 scanline
+ * @param width  width in pixels
+ */
+static void premultiply_packed4444_scanline_mmxext( uint8_t *output, uint8_t *input, int width )
+{
+    const mmx_t round  = { 0x0080008000800080ULL };
+    /* alpha keeps only the low byte of word 0, noalp clears word 0. */
+    const mmx_t alpha  = { 0x00000000000000ffULL };
+    const mmx_t noalp  = { 0xffffffffffff0000ULL };
+
+    SPEEDY_START();
+
+    /* mm7 = 0, used to widen bytes to words. */
+    pxor_r2r( mm7, mm7 );
+    while( width-- ) {
+        /* mm0 = the pixel's four bytes widened to words, alpha in word 0. */
+        movd_m2r( *input, mm0 );
+        punpcklbw_r2r( mm7, mm0 );
+
+        /* mm2 = [ a ][ a ][ a ][ a ], mm4 = [ 0 ][ 0 ][ 0 ][ a ] */
+        movq_r2r( mm0, mm2 );
+        pshufw_r2r( mm2, mm2, 0 );
+        movq_r2r( mm2, mm4 );
+        pand_m2r( alpha, mm4 );
+
+        /* Multiply every lane by alpha and add the rounding constant. */
+        pmullw_r2r( mm2, mm0 );
+        paddw_m2r( round, mm0 );
+
+        /* mm0 = (t + (t >> 8)) >> 8 */
+        movq_r2r( mm0, mm3 );
+        psrlw_i2r( 8, mm3 );
+        paddw_r2r( mm3, mm0 );
+        psrlw_i2r( 8, mm0 );
+
+        /* Drop the scaled alpha lane and put the original alpha back. */
+        pand_m2r( noalp, mm0 );
+        paddw_r2r( mm4, mm0 );
+
+        packuswb_r2r( mm0, mm0 );
+        movd_r2m( mm0, *output );
+
+        output += 4;
+        input += 4;
+    }
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Cross-fades two packed422 scanlines:
+ *   output = (src1*(256 - pos) + src2*pos + 0x80) >> 8
+ * applied to each of the width*2 bytes.
+ *
+ * pos <= 0 copies src1 and pos >= 256 copies src2 through
+ * blit_packed422_scanline(); pos == 128 is delegated to
+ * interpolate_packed422_scanline().  Out-of-range values are clamped the
+ * same way blend_packed422_scanline_mmxext() clamps them, instead of being
+ * fed into the blend formula where they would produce wrapped bytes.
+ *
+ * @param output destination packed422 scanline
+ * @param src1   scanline that has full weight at pos == 0
+ * @param src2   scanline that has full weight at pos == 256
+ * @param width  width in pixels
+ * @param pos    blend position, 0..256
+ */
+static void blend_packed422_scanline_c( uint8_t *output, uint8_t *src1,
+                                 uint8_t *src2, int width, int pos )
+{
+    if( pos <= 0 ) {
+        blit_packed422_scanline( output, src1, width );
+    } else if( pos >= 256 ) {
+        blit_packed422_scanline( output, src2, width );
+    } else if( pos == 128 ) {
+        interpolate_packed422_scanline( output, src1, src2, width );
+    } else {
+        /* Two bytes per pixel in packed422. */
+        width *= 2;
+        while( width-- ) {
+            *output++ = ( (*src1++ * ( 256 - pos )) + (*src2++ * pos) + 0x80 ) >> 8;
+        }
+    }
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMXEXT variant of blend_packed422_scanline: cross-fades two packed422
+ * scanlines with
+ *   output = (src1*(256 - pos) + src2*pos + 0x80) >> 8
+ * per byte.
+ *
+ * pos <= 0 copies src1 and pos >= 256 copies src2 through
+ * blit_packed422_scanline(); pos == 128 is delegated to
+ * interpolate_packed422_scanline().  Otherwise two pixels (4 bytes) are
+ * handled per iteration, so width/2 pairs are processed and an odd
+ * trailing pixel is not written.
+ *
+ * @param output destination packed422 scanline
+ * @param src1   scanline that has full weight at pos == 0
+ * @param src2   scanline that has full weight at pos == 256
+ * @param width  width in pixels
+ * @param pos    blend position, 0..256
+ */
+static void blend_packed422_scanline_mmxext( uint8_t *output, uint8_t *src1,
+                                      uint8_t *src2, int width, int pos )
+{
+    if( pos <= 0 ) {
+        blit_packed422_scanline( output, src1, width );
+    } else if( pos >= 256 ) {
+        blit_packed422_scanline( output, src2, width );
+    } else if( pos == 128 ) {
+        interpolate_packed422_scanline( output, src1, src2, width );
+    } else {
+        const mmx_t all256 = { 0x0100010001000100ULL };
+        const mmx_t round  = { 0x0080008000800080ULL };
+
+        SPEEDY_START();
+
+        /* mm0 = [ pos ] x 4, mm1 = [ 256 - pos ] x 4, mm7 = 0 */
+        movd_m2r( pos, mm0 );
+        pshufw_r2r( mm0, mm0, 0 );
+        movq_m2r( all256, mm1 );
+        psubw_r2r( mm0, mm1 );
+        pxor_r2r( mm7, mm7 );
+
+        for( width /= 2; width; width-- ) {
+            /* Load 4 bytes from each source and widen them to words. */
+            movd_m2r( *src1, mm3 );
+            movd_m2r( *src2, mm4 );
+            punpcklbw_r2r( mm7, mm3 );
+            punpcklbw_r2r( mm7, mm4 );
+
+            /* mm3 = (src1*(256 - pos) + src2*pos + 0x80) >> 8 */
+            pmullw_r2r( mm1, mm3 );
+            pmullw_r2r( mm0, mm4 );
+            paddw_r2r( mm4, mm3 );
+            paddw_m2r( round, mm3 );
+            psrlw_i2r( 8, mm3 );
+
+            packuswb_r2r( mm3, mm3 );
+            movd_r2m( mm3, *output );
+
+            output += 4;
+            src1 += 4;
+            src2 += 4;
+        }
+        sfence();
+        emms();
+
+        SPEEDY_END();
+    }
+}
+
+/**
+ * MMXEXT variant of quarter_blit_vertical_packed422_scanline: writes a
+ * scanline weighted roughly 1/4 towards `one` and 3/4 towards `three`.
+ *
+ * The vector paths average with `three` twice (pavgb), the scalar tail
+ * computes (one + 3*three + 2) / 4.  The scanline is consumed in three
+ * stages: blocks of 16 pixels (32 bytes), blocks of 4 pixels (8 bytes),
+ * then the remaining 0..3 pixels one byte at a time.
+ *
+ * @param output destination packed422 scanline
+ * @param one    scanline that gets the 1/4 weight
+ * @param three  scanline that gets the 3/4 weight
+ * @param width  width in pixels
+ */
+static void quarter_blit_vertical_packed422_scanline_mmxext( uint8_t *output, uint8_t *one,
+                                                      uint8_t *three, int width )
+{
+    int i;
+
+    SPEEDY_START();
+    /* 16 pixels (32 bytes) per iteration. */
+    for( i = width/16; i; --i ) {
+        movq_m2r( *one, mm0 );
+        movq_m2r( *three, mm1 );
+        movq_m2r( *(one + 8), mm2 );
+        movq_m2r( *(three + 8), mm3 );
+        movq_m2r( *(one + 16), mm4 );
+        movq_m2r( *(three + 16), mm5 );
+        movq_m2r( *(one + 24), mm6 );
+        movq_m2r( *(three + 24), mm7 );
+        /* Averaging with `three` twice gives it a 3/4 weight. */
+        pavgb_r2r( mm1, mm0 );
+        pavgb_r2r( mm1, mm0 );
+        pavgb_r2r( mm3, mm2 );
+        pavgb_r2r( mm3, mm2 );
+        pavgb_r2r( mm5, mm4 );
+        pavgb_r2r( mm5, mm4 );
+        pavgb_r2r( mm7, mm6 );
+        pavgb_r2r( mm7, mm6 );
+        movntq_r2m( mm0, *output );
+        movntq_r2m( mm2, *(output + 8) );
+        movntq_r2m( mm4, *(output + 16) );
+        movntq_r2m( mm6, *(output + 24) );
+        output += 32;
+        one += 32;
+        three += 32;
+    }
+    width = (width & 0xf);
+
+    /* 4 pixels (8 bytes) per iteration. */
+    for( i = width/4; i; --i ) {
+        movq_m2r( *one, mm0 );
+        movq_m2r( *three, mm1 );
+        pavgb_r2r( mm1, mm0 );
+        pavgb_r2r( mm1, mm0 );
+        movntq_r2m( mm0, *output );
+        output += 8;
+        one += 8;
+        three += 8;
+    }
+    /**
+     * The loop above consumed every full group of 4 pixels, so only
+     * width % 4 pixels remain.  Masking with 0x7 here would redo 4 pixels
+     * whenever bit 2 of the width is set and run 8 bytes past the end of
+     * all three buffers.
+     */
+    width = width & 0x3;
+
+    /* Handle last few pixels. */
+    for( i = width * 2; i; --i ) {
+        *output++ = (*one + *three + *three + *three + 2) / 4;
+        one++;
+        three++;
+    }
+
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Portable C version of quarter_blit_vertical_packed422_scanline: every
+ * one of the width*2 output bytes is (one + 3*three + 2) / 4, i.e. a blend
+ * weighted 1/4 towards `one` and 3/4 towards `three`.
+ *
+ * @param output destination packed422 scanline
+ * @param one    scanline that gets the 1/4 weight
+ * @param three  scanline that gets the 3/4 weight
+ * @param width  width in pixels
+ */
+static void quarter_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *one,
+                                                 uint8_t *three, int width )
+{
+    int bytes_left;
+
+    SPEEDY_START();
+    /* Two bytes per pixel in packed422. */
+    for( bytes_left = width * 2; bytes_left != 0; bytes_left-- ) {
+        const int light = *one++;
+        const int heavy = *three++;
+
+        *output++ = (light + 3 * heavy + 2) / 4;
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Blends two vertically adjacent packed422 scanlines at a sub-pixel
+ * position given in 1/65536 units, where subpixpos is the weight of `top`.
+ *
+ * Three positions are routed to specialised scanline functions:
+ *   32768 (1/2) -> interpolate_packed422_scanline( top, bot )
+ *   16384 (1/4) -> quarter_blit_vertical_packed422_scanline( top, bot )
+ *   49152 (3/4) -> quarter_blit_vertical_packed422_scanline( bot, top )
+ * Any other value uses
+ *   (top*subpixpos + bot*(0xffff - subpixpos)) >> 16
+ * on each of the width*2 bytes.
+ *
+ * @param output    destination packed422 scanline
+ * @param top       upper source scanline
+ * @param bot       lower source scanline
+ * @param subpixpos weight of `top` in 1/65536 units
+ * @param width     width in pixels
+ */
+static void subpix_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *top,
+                                                uint8_t *bot, int subpixpos, int width )
+{
+    switch( subpixpos ) {
+    case 32768:
+        interpolate_packed422_scanline( output, top, bot, width );
+        return;
+    case 16384:
+        quarter_blit_vertical_packed422_scanline( output, top, bot, width );
+        return;
+    case 49152:
+        quarter_blit_vertical_packed422_scanline( output, bot, top, width );
+        return;
+    default:
+        break;
+    }
+
+    {
+        int bytes;
+        int n;
+
+        SPEEDY_START();
+
+        /* Two bytes per pixel in packed422. */
+        bytes = width * 2;
+        for( n = 0; n < bytes; n++ ) {
+            const int from_top = top[ n ] * subpixpos;
+            const int from_bot = bot[ n ] * ( 0xffff - subpixpos );
+
+            output[ n ] = ( from_top + from_bot ) >> 16;
+        }
+        SPEEDY_END();
+    }
+}
+
+/**
+ * Copies a scanline of 8-bit values shifted horizontally by a sub-pixel
+ * amount.  Each output value mixes the previous input value with the
+ * current one:
+ *   w      = 0xffff - (startpos & 0xffff)
+ *   out[x] = (prev*w + in[x]*(0xffff - w)) >> 16
+ * where prev is lasta for the first value and in[x-1] afterwards.
+ *
+ * @param output   destination, width bytes
+ * @param input    source, width bytes
+ * @param lasta    value treated as the one just before input[ 0 ]
+ * @param startpos position in 1/65536 units; only the low 16 bits are used
+ * @param width    number of values to produce
+ */
+static void a8_subpix_blit_scanline_c( uint8_t *output, uint8_t *input,
+                                int lasta, int startpos, int width )
+{
+    const int prev_weight = 0xffff - (startpos & 0xffff);
+    const int cur_weight = 0xffff - prev_weight;
+    int left_value = lasta;
+    int n;
+
+    for( n = 0; n < width; n++ ) {
+        const int from_left = left_value * prev_weight;
+        const int from_here = input[ n ] * cur_weight;
+
+        output[ n ] = ( from_left + from_here ) >> 16;
+        /* Re-read the input after the store, as the next pixel's neighbour. */
+        left_value = input[ n ];
+    }
+}
+
+
+/* Acceleration flags returned by xine_mm_accel() in setup_speedy_calls(). */
+static uint32_t speedy_accel;
+
+/**
+ * Points every scanline function pointer at an implementation.
+ *
+ * All pointers are first set to the portable C versions, then, when built
+ * with ARCH_X86, overridden according to the flags from xine_mm_accel():
+ * MMXEXT is preferred, plain MMX is the second choice.  Pointers that a
+ * branch does not reassign keep their C version.
+ *
+ * comb_factor_packed422_scanline has no C version here: it is set to 0 and
+ * only receives a function in the MMXEXT and MMX branches.
+ *
+ * @param verbose when non-zero, the chosen implementation set is reported
+ *                on stderr
+ */
+void setup_speedy_calls( int verbose )
+{
+    speedy_accel = xine_mm_accel();
+
+    /* Portable defaults. */
+    interpolate_packed422_scanline = interpolate_packed422_scanline_c;
+    blit_colour_packed422_scanline = blit_colour_packed422_scanline_c;
+    blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_c;
+    blit_packed422_scanline = blit_packed422_scanline_c;
+    composite_packed4444_to_packed422_scanline = composite_packed4444_to_packed422_scanline_c;
+    composite_packed4444_alpha_to_packed422_scanline = composite_packed4444_alpha_to_packed422_scanline_c;
+    composite_alphamask_to_packed4444_scanline = composite_alphamask_to_packed4444_scanline_c;
+    composite_alphamask_alpha_to_packed4444_scanline = composite_alphamask_alpha_to_packed4444_scanline_c;
+    premultiply_packed4444_scanline = premultiply_packed4444_scanline_c;
+    blend_packed422_scanline = blend_packed422_scanline_c;
+    filter_luma_121_packed422_inplace_scanline = filter_luma_121_packed422_inplace_scanline_c;
+    filter_luma_14641_packed422_inplace_scanline = filter_luma_14641_packed422_inplace_scanline_c;
+    /* No C implementation is installed for the comb factor. */
+    comb_factor_packed422_scanline = 0;
+    diff_factor_packed422_scanline = diff_factor_packed422_scanline_c;
+    kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_c;
+    mirror_packed422_inplace_scanline = mirror_packed422_inplace_scanline_c;
+    halfmirror_packed422_inplace_scanline = halfmirror_packed422_inplace_scanline_c;
+    speedy_memcpy = xine_fast_memcpy;
+    diff_packed422_block8x8 = diff_packed422_block8x8_c;
+    a8_subpix_blit_scanline = a8_subpix_blit_scanline_c;
+    quarter_blit_vertical_packed422_scanline = quarter_blit_vertical_packed422_scanline_c;
+    subpix_blit_vertical_packed422_scanline = subpix_blit_vertical_packed422_scanline_c;
+
+#ifdef ARCH_X86
+    if( speedy_accel & MM_ACCEL_X86_MMXEXT ) {
+        if( verbose ) {
+            fprintf( stderr, "speedycode: Using MMXEXT optimized functions.\n" );
+        }
+        interpolate_packed422_scanline = interpolate_packed422_scanline_mmxext;
+        blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmxext;
+        blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmxext;
+        blit_packed422_scanline = blit_packed422_scanline_mmxext;
+        composite_packed4444_to_packed422_scanline = composite_packed4444_to_packed422_scanline_mmxext;
+        composite_packed4444_alpha_to_packed422_scanline = composite_packed4444_alpha_to_packed422_scanline_mmxext;
+        composite_alphamask_to_packed4444_scanline = composite_alphamask_to_packed4444_scanline_mmxext;
+        premultiply_packed4444_scanline = premultiply_packed4444_scanline_mmxext;
+        kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_mmx;
+        blend_packed422_scanline = blend_packed422_scanline_mmxext;
+        diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx;
+        comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx;
+        diff_packed422_block8x8 = diff_packed422_block8x8_mmx;
+        quarter_blit_vertical_packed422_scanline = quarter_blit_vertical_packed422_scanline_mmxext;
+    } else if( speedy_accel & MM_ACCEL_X86_MMX ) {
+        if( verbose ) {
+            fprintf( stderr, "speedycode: Using MMX optimized functions.\n" );
+        }
+        interpolate_packed422_scanline = interpolate_packed422_scanline_mmx;
+        blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmx;
+        blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmx;
+        blit_packed422_scanline = blit_packed422_scanline_mmx;
+        diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx;
+        comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx;
+        kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_mmx;
+        diff_packed422_block8x8 = diff_packed422_block8x8_mmx;
+    } else
+#endif
+    /* Without ARCH_X86 this block is unconditional; with it, it is the final else. */
+    {
+        if( verbose ) {
+            fprintf( stderr, "speedycode: No MMX or MMXEXT support detected, using C fallbacks.\n" );
+        }
+    }
+}
+
+/**
+ * Returns the acceleration flags most recently stored by
+ * setup_speedy_calls(), converted to int.
+ */
+int speedy_get_accel( void )
+{
+    const int accel_flags = (int) speedy_accel;
+
+    return accel_flags;
+}
+