summaryrefslogtreecommitdiff
path: root/src/post/deinterlace/speedy.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/post/deinterlace/speedy.c')
-rw-r--r--src/post/deinterlace/speedy.c1856
1 files changed, 1856 insertions, 0 deletions
diff --git a/src/post/deinterlace/speedy.c b/src/post/deinterlace/speedy.c
new file mode 100644
index 000000000..b06e4bc88
--- /dev/null
+++ b/src/post/deinterlace/speedy.c
@@ -0,0 +1,1856 @@
+/**
+ * Copyright (c) 2002, 2003 Billy Biggs <vektor@dumbterm.net>.
+ * Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Uses code from:
+ *
+ * linux/arch/i386/kernel/setup.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Found in linux 2.4.20.
+ *
+ * Also helped by code in 'cpuinfo.c' found in mplayer.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <ctype.h>
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "attributes.h"
+#include "xineutils.h"
+#include "speedtools.h"
+#include "speedy.h"
+
+/*
+ * Function pointer definitions.
+ *
+ * One dispatch slot per scanline/block primitive.  The slots are defined
+ * here without initializers.  NOTE(review): presumably a setup routine
+ * elsewhere in this file points each slot at the C, MMX or MMXEXT
+ * implementation -- confirm against the rest of the file.
+ */
+void (*interpolate_packed422_scanline)( uint8_t *output, uint8_t *top,
+ uint8_t *bot, int width );
+void (*blit_colour_packed422_scanline)( uint8_t *output,
+ int width, int y, int cb, int cr );
+void (*blit_colour_packed4444_scanline)( uint8_t *output,
+ int width, int alpha, int luma,
+ int cb, int cr );
+void (*blit_packed422_scanline)( uint8_t *dest, const uint8_t *src, int width );
+void (*composite_packed4444_to_packed422_scanline)( uint8_t *output, uint8_t *input,
+ uint8_t *foreground, int width );
+void (*composite_packed4444_alpha_to_packed422_scanline)( uint8_t *output,
+ uint8_t *input,
+ uint8_t *foreground,
+ int width, int alpha );
+void (*composite_alphamask_to_packed4444_scanline)( uint8_t *output,
+ uint8_t *input,
+ uint8_t *mask, int width,
+ int textluma, int textcb,
+ int textcr );
+void (*composite_alphamask_alpha_to_packed4444_scanline)( uint8_t *output,
+ uint8_t *input,
+ uint8_t *mask, int width,
+ int textluma, int textcb,
+ int textcr, int alpha );
+void (*premultiply_packed4444_scanline)( uint8_t *output, uint8_t *input, int width );
+void (*blend_packed422_scanline)( uint8_t *output, uint8_t *src1,
+ uint8_t *src2, int width, int pos );
+void (*filter_luma_121_packed422_inplace_scanline)( uint8_t *data, int width );
+void (*filter_luma_14641_packed422_inplace_scanline)( uint8_t *data, int width );
+unsigned int (*diff_factor_packed422_scanline)( uint8_t *cur, uint8_t *old, int width );
+unsigned int (*comb_factor_packed422_scanline)( uint8_t *top, uint8_t *mid,
+ uint8_t *bot, int width );
+void (*kill_chroma_packed422_inplace_scanline)( uint8_t *data, int width );
+void (*mirror_packed422_inplace_scanline)( uint8_t *data, int width );
+void (*halfmirror_packed422_inplace_scanline)( uint8_t *data, int width );
+void *(*speedy_memcpy)( void *output, const void *input, size_t size );
+void (*diff_packed422_block8x8)( pulldown_metrics_t *m, uint8_t *old,
+ uint8_t *new, int os, int ns );
+void (*a8_subpix_blit_scanline)( uint8_t *output, uint8_t *input,
+ int lasta, int startpos, int width );
+void (*quarter_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *one,
+ uint8_t *three, int width );
+void (*subpix_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *top,
+ uint8_t *bot, int subpixpos, int width );
+
+
+/*
+ * Hooks placed at the entry and exit of the routines in this file.  Both
+ * expand to nothing here, so they generate no code.
+ */
+#define SPEEDY_START()
+
+#define SPEEDY_END()
+
+/**
+ * Alpha blending as used throughout this file:
+ *
+ *   result = (1 - alpha)*B + alpha*F
+ *          = B - alpha*B + alpha*F
+ *          = B + alpha*(F - B)
+ *
+ * multiply_alpha() scales r by the 8-bit factor a without a division:
+ * with t = a*r + 0x80 it returns (t + (t >> 8)) >> 8, the usual
+ * shift-and-add approximation of (a * r) / 255.
+ */
+static inline __attribute__ ((always_inline,const)) int multiply_alpha( int a, int r )
+{
+    const int biased = (a * r) + 0x80;
+
+    return (biased + (biased >> 8)) >> 8;
+}
+
+/* Saturates x to the 0..255 range of an 8-bit sample. */
+static inline __attribute__ ((always_inline,const)) uint8_t clip255( int x )
+{
+    if( x < 0 ) {
+        return 0;
+    }
+    return ( x > 255 ) ? 255 : x;
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX comb detector for one scanline.
+ *
+ * Walks three scanlines (top, mid, bot) eight bytes at a time, keeps only
+ * the luma samples, halves them, and counts the samples for which
+ *
+ *   (top - mid) * (bot - mid) - ((top - bot)^2 >> 7)
+ *
+ * is greater than CombJaggieThreshold.  Only width/4 groups are examined;
+ * any remaining pixels are ignored.
+ *
+ * Returns the number of samples that exceeded the threshold.
+ */
+static unsigned int comb_factor_packed422_scanline_mmx( uint8_t *top, uint8_t *mid,
+ uint8_t *bot, int width )
+{
+ const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL };
+ const mmx_t qwOnes = { 0x0001000100010001ULL };
+ mmx_t qwThreshold;
+ unsigned int temp1, temp2;
+ unsigned long CombJaggieThreshold = 73;
+
+ SPEEDY_START();
+
+ width /= 4;
+
+ /* Replicate the threshold into all four 16-bit lanes. */
+ qwThreshold.uw[ 0 ] = CombJaggieThreshold;
+ qwThreshold.uw[ 1 ] = CombJaggieThreshold;
+ qwThreshold.uw[ 2 ] = CombJaggieThreshold;
+ qwThreshold.uw[ 3 ] = CombJaggieThreshold;
+
+ movq_m2r( qwThreshold, mm0 );
+ movq_m2r( qwYMask, mm1 );
+ movq_m2r( qwOnes, mm2 );
+ pxor_r2r( mm7, mm7 ); /* mm7 = 0; accumulates the per-lane hit counts. */
+
+ while( width-- ) {
+ /* Load and keep just the luma. */
+ movq_m2r( *top, mm3 );
+ movq_m2r( *mid, mm4 );
+ movq_m2r( *bot, mm5 );
+
+ pand_r2r( mm1, mm3 );
+ pand_r2r( mm1, mm4 );
+ pand_r2r( mm1, mm5 );
+
+ /* Work out mm6 = (top - mid) * (bot - mid) - ( (top - bot)^2 >> 7 ), on halved luma. */
+ psrlw_i2r( 1, mm3 );
+ psrlw_i2r( 1, mm4 );
+ psrlw_i2r( 1, mm5 );
+
+ /* mm6 = (top - mid) */
+ movq_r2r( mm3, mm6 );
+ psubw_r2r( mm4, mm6 );
+
+ /* mm3 = (top - bot) */
+ psubw_r2r( mm5, mm3 );
+
+ /* mm5 = (bot - mid) */
+ psubw_r2r( mm4, mm5 );
+
+ /* mm6 = (top - mid) * (bot - mid) */
+ pmullw_r2r( mm5, mm6 );
+
+ /* mm3 = (top - bot)^2 >> 7 */
+ pmullw_r2r( mm3, mm3 ); /* mm3 = (top - bot)^2 */
+ psrlw_i2r( 7, mm3 ); /* mm3 = ((top - bot)^2 >> 7) */
+
+ /* mm6 is what we want. */
+ psubw_r2r( mm3, mm6 );
+
+ /* FF's if greater than qwThreshold */
+ pcmpgtw_r2r( mm0, mm6 );
+
+ /* Add to count if we are greater than threshold */
+ pand_r2r( mm2, mm6 );
+ paddw_r2r( mm6, mm7 );
+
+ top += 8;
+ mid += 8;
+ bot += 8;
+ }
+
+ /* Fold the four 16-bit lane counters of mm7 into a single total. */
+ movd_r2m( mm7, temp1 );
+ psrlq_i2r( 32, mm7 );
+ movd_r2m( mm7, temp2 );
+ temp1 += temp2;
+ temp2 = temp1;
+ temp1 >>= 16;
+ temp1 += temp2 & 0xffff;
+
+ emms();
+
+ SPEEDY_END();
+
+ return temp1;
+}
+
+#endif
+
+/* Right shift applied to each squared luma difference by the diff_factor routines. */
+static unsigned long BitShift = 6;
+
+/**
+ * Portable difference metric between two packed 4:2:2 scanlines.
+ *
+ * For every group of four pixels the rounded mean luma of cur and old is
+ * taken, the difference is squared, shifted right by BitShift and added
+ * to the running total.  Pixels beyond the last full group of four are
+ * ignored.
+ *
+ * Returns the accumulated total.
+ */
+static unsigned int diff_factor_packed422_scanline_c( uint8_t *cur, uint8_t *old, int width )
+{
+    unsigned int total = 0;
+    int groups;
+
+    SPEEDY_START();
+
+    groups = width / 4;
+
+    for( ; groups--; cur += 8, old += 8 ) {
+        unsigned int cur_mean = (cur[ 0 ] + cur[ 2 ] + cur[ 4 ] + cur[ 6 ] + 2) >> 2;
+        unsigned int old_mean = (old[ 0 ] + old[ 2 ] + old[ 4 ] + old[ 6 ] + 2) >> 2;
+        unsigned int delta = cur_mean - old_mean;
+
+        /* Unsigned wrap-around still yields the correct square. */
+        total += (delta * delta) >> BitShift;
+    }
+    SPEEDY_END();
+
+    return total;
+}
+
+/**
+ * Coarser variant of diff_factor_packed422_scanline_c.
+ *
+ * Only the first four pixels of every sixteen are sampled, and the mean
+ * luma is truncated instead of rounded.  The squared difference is shifted
+ * right by BitShift and accumulated.
+ *
+ * Returns the accumulated total.
+ */
+static unsigned int diff_factor_packed422_scanline_test_c( uint8_t *cur, uint8_t *old, int width )
+{
+    unsigned int total = 0;
+    int groups;
+
+    SPEEDY_START();
+
+    groups = width / 16;
+
+    for( ; groups--; cur += (8*4), old += (8*4) ) {
+        unsigned int cur_mean = (cur[ 0 ] + cur[ 2 ] + cur[ 4 ] + cur[ 6 ]) >> 2;
+        unsigned int old_mean = (old[ 0 ] + old[ 2 ] + old[ 4 ] + old[ 6 ]) >> 2;
+        unsigned int delta = cur_mean - old_mean;
+
+        total += (delta * delta) >> BitShift;
+    }
+    SPEEDY_END();
+
+    return total;
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX difference metric between two packed 4:2:2 scanlines.
+ *
+ * Processes four pixels (8 bytes) per iteration: the luma samples of cur
+ * and old are subtracted, the differences are squared and pair-wise summed
+ * by pmaddwd, each 32-bit sum is shifted right by BitShift and accumulated.
+ * Unlike the C version, the four samples are not averaged before squaring.
+ * Pixels beyond the last full group of four are ignored.
+ *
+ * Returns the sum of the two 32-bit accumulators.
+ */
+static unsigned int diff_factor_packed422_scanline_mmx( uint8_t *cur, uint8_t *old, int width )
+{
+ const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL };
+ unsigned int temp1, temp2;
+
+ SPEEDY_START();
+
+ width /= 4;
+
+ movq_m2r( qwYMask, mm1 );
+ /* NOTE(review): movd loads 32 bits while BitShift is an unsigned long; confirm this is intended on 64-bit targets. */
+ movd_m2r( BitShift, mm7 );
+ pxor_r2r( mm0, mm0 );
+
+ while( width-- ) {
+ movq_m2r( *cur, mm4 );
+ movq_m2r( *old, mm5 );
+
+ /* Keep only the luma bytes. */
+ pand_r2r( mm1, mm4 );
+ pand_r2r( mm1, mm5 );
+
+ psubw_r2r( mm5, mm4 ); /* mm4 = Y1 - Y2 */
+ pmaddwd_r2r( mm4, mm4 ); /* mm4 = (Y1 - Y2)^2 */
+ psrld_r2r( mm7, mm4 ); /* divide mm4 by 2^BitShift */
+ paddd_r2r( mm4, mm0 ); /* keep total in mm0 */
+
+ cur += 8;
+ old += 8;
+ }
+
+ /* Add the low and high 32-bit halves of the accumulator. */
+ movd_r2m( mm0, temp1 );
+ psrlq_i2r( 32, mm0 );
+ movd_r2m( mm0, temp2 );
+ temp1 += temp2;
+
+ emms();
+
+ SPEEDY_END();
+
+ return temp1;
+}
+
+/* Absolute value.  The argument is evaluated twice, so it must have no side effects. */
+#define ABS(a) (((a) < 0)?-(a):(a))
+
+/**
+ * MMX computation of the pulldown metrics for one block of luma.
+ *
+ * Reads 16 bytes (eight luma samples) from each of eight rows of the old
+ * and new images; os and ns are the byte strides between rows.
+ *
+ * Fills in *m:
+ *   e - sum of |new - old| over the even rows
+ *   o - sum of |new - old| over the odd rows
+ *   d - e + o
+ *   p - per-column |sum of (old odd row - old even row)|, summed over columns
+ *   t - per-column |sum of (old odd row - new even row)|, summed over columns
+ *   s - per-column |sum of (new odd row - new even row)|, summed over columns
+ */
+static void diff_packed422_block8x8_mmx( pulldown_metrics_t *m, uint8_t *old,
+ uint8_t *new, int os, int ns )
+{
+ const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
+ short out[ 24 ]; /* Output buffer for the partial metrics from the mmx code. */
+ uint8_t *outdata = (uint8_t *) out;
+ uint8_t *oldp, *newp;
+ int i;
+
+ SPEEDY_START();
+
+ pxor_r2r( mm4, mm4 ); // 4 even difference sums.
+ pxor_r2r( mm5, mm5 ); // 4 odd difference sums.
+ pxor_r2r( mm7, mm7 ); // zeros
+
+ oldp = old; newp = new;
+ for( i = 4; i; --i ) {
+ // Even difference.
+ movq_m2r( oldp[0], mm0 );
+ movq_m2r( oldp[8], mm2 );
+ pand_m2r( ymask, mm0 );
+ pand_m2r( ymask, mm2 );
+ oldp += os;
+
+ movq_m2r( newp[0], mm1 );
+ movq_m2r( newp[8], mm3 );
+ pand_m2r( ymask, mm1 );
+ pand_m2r( ymask, mm3 );
+ newp += ns;
+
+ // Saturating subtraction both ways: one result is 0, the other |old - new|.
+ movq_r2r( mm0, mm6 );
+ psubusb_r2r( mm1, mm0 );
+ psubusb_r2r( mm6, mm1 );
+ movq_r2r( mm2, mm6 );
+ psubusb_r2r( mm3, mm2 );
+ psubusb_r2r( mm6, mm3 );
+
+ paddw_r2r( mm0, mm4 );
+ paddw_r2r( mm1, mm4 );
+ paddw_r2r( mm2, mm4 );
+ paddw_r2r( mm3, mm4 );
+
+ // Odd difference.
+ movq_m2r( oldp[0], mm0 );
+ movq_m2r( oldp[8], mm2 );
+ pand_m2r( ymask, mm0 );
+ pand_m2r( ymask, mm2 );
+ oldp += os;
+
+ movq_m2r( newp[0], mm1 );
+ movq_m2r( newp[8], mm3 );
+ pand_m2r( ymask, mm1 );
+ pand_m2r( ymask, mm3 );
+ newp += ns;
+
+ movq_r2r( mm0, mm6 );
+ psubusb_r2r( mm1, mm0 );
+ psubusb_r2r( mm6, mm1 );
+ movq_r2r( mm2, mm6 );
+ psubusb_r2r( mm3, mm2 );
+ psubusb_r2r( mm6, mm3 );
+
+ paddw_r2r( mm0, mm5 );
+ paddw_r2r( mm1, mm5 );
+ paddw_r2r( mm2, mm5 );
+ paddw_r2r( mm3, mm5 );
+ }
+ // out[0..3] receive the even sums, out[4..7] the odd sums.
+ movq_r2m( mm4, outdata[0] );
+ movq_r2m( mm5, outdata[8] );
+
+ m->e = out[0] + out[1] + out[2] + out[3];
+ m->o = out[4] + out[5] + out[6] + out[7];
+ m->d = m->e + m->o;
+
+ pxor_r2r( mm4, mm4 ); // Past spacial noise.
+ pxor_r2r( mm5, mm5 ); // Temporal noise.
+ pxor_r2r( mm6, mm6 ); // Current spacial noise.
+
+ // First loop to measure first four columns
+ oldp = old; newp = new;
+ for( i = 4; i; --i ) {
+ movq_m2r( oldp[0], mm0 );
+ movq_m2r( oldp[os], mm1 );
+ pand_m2r( ymask, mm0 );
+ pand_m2r( ymask, mm1 );
+ oldp += (os*2);
+
+ movq_m2r( newp[0], mm2 );
+ movq_m2r( newp[ns], mm3 );
+ pand_m2r( ymask, mm2 );
+ pand_m2r( ymask, mm3 );
+ newp += (ns*2);
+
+ // mm4 += old odd - old even, mm5 += old odd - new even, mm6 += new odd - new even.
+ paddw_r2r( mm1, mm4 );
+ paddw_r2r( mm1, mm5 );
+ paddw_r2r( mm3, mm6 );
+ psubw_r2r( mm0, mm4 );
+ psubw_r2r( mm2, mm5 );
+ psubw_r2r( mm2, mm6 );
+ }
+ // Byte offsets 0, 16 and 32 are out[0], out[8] and out[16].
+ movq_r2m( mm4, outdata[0] );
+ movq_r2m( mm5, outdata[16] );
+ movq_r2m( mm6, outdata[32] );
+
+ pxor_r2r( mm4, mm4 );
+ pxor_r2r( mm5, mm5 );
+ pxor_r2r( mm6, mm6 );
+
+ // Second loop for the last four columns
+ oldp = old; newp = new;
+ for( i = 4; i; --i ) {
+ movq_m2r( oldp[8], mm0 );
+ movq_m2r( oldp[os+8], mm1 );
+ pand_m2r( ymask, mm0 );
+ pand_m2r( ymask, mm1 );
+ oldp += (os*2);
+
+ movq_m2r( newp[8], mm2 );
+ movq_m2r( newp[ns+8], mm3 );
+ pand_m2r( ymask, mm2 );
+ pand_m2r( ymask, mm3 );
+ newp += (ns*2);
+
+ paddw_r2r( mm1, mm4 );
+ paddw_r2r( mm1, mm5 );
+ paddw_r2r( mm3, mm6 );
+ psubw_r2r( mm0, mm4 );
+ psubw_r2r( mm2, mm5 );
+ psubw_r2r( mm2, mm6 );
+ }
+ // Byte offsets 8, 24 and 40 are out[4], out[12] and out[20].
+ movq_r2m( mm4, outdata[8] );
+ movq_r2m( mm5, outdata[24] );
+ movq_r2m( mm6, outdata[40] );
+
+ m->p = m->t = m->s = 0;
+ for (i=0; i<8; i++) {
+ // FIXME: move abs() into the mmx code!
+ m->p += ABS(out[i]);
+ m->t += ABS(out[8+i]);
+ m->s += ABS(out[16+i]);
+ }
+
+ emms();
+
+ SPEEDY_END();
+}
+
+#endif
+
+static void diff_packed422_block8x8_c( pulldown_metrics_t *m, uint8_t *old,
+ uint8_t *new, int os, int ns )
+{
+ int x, y, e=0, o=0, s=0, p=0, t=0;
+ uint8_t *oldp, *newp;
+
+ SPEEDY_START();
+ m->s = m->p = m->t = 0;
+ for (x = 8; x; x--) {
+ oldp = old; old += 2;
+ newp = new; new += 2;
+ s = p = t = 0;
+ for (y = 4; y; y--) {
+ e += ABS(newp[0] - oldp[0]);
+ o += ABS(newp[ns] - oldp[os]);
+ s += newp[ns]-newp[0];
+ p += oldp[os]-oldp[0];
+ t += oldp[os]-newp[0];
+ oldp += os<<1;
+ newp += ns<<1;
+ }
+ m->s += ABS(s);
+ m->p += ABS(p);
+ m->t += ABS(t);
+ }
+ m->e = e;
+ m->o = o;
+ m->d = e+o;
+ SPEEDY_END();
+}
+
+/**
+ * Converts width/2 pixel pairs from 3-byte-per-pixel input to 4:2:2.
+ * Each pair consumes six input bytes and produces four output bytes:
+ * input bytes 0, 1, 3 and 2 in that order; input bytes 4 and 5 are dropped.
+ */
+static void packed444_to_packed422_scanline_c( uint8_t *output, uint8_t *input, int width )
+{
+    const int pairs = width / 2;
+    int n;
+
+    SPEEDY_START();
+    for( n = 0; n < pairs; n++ ) {
+        uint8_t *dst = output + (n * 4);
+        uint8_t *src = input + (n * 6);
+
+        dst[ 0 ] = src[ 0 ];
+        dst[ 1 ] = src[ 1 ];
+        dst[ 2 ] = src[ 3 ];
+        dst[ 3 ] = src[ 2 ];
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Expands width/2 pixel pairs from 4:2:2 to 3 bytes per pixel by repeating
+ * the chroma of the pair.  Each pair consumes four input bytes and
+ * produces six output bytes: input bytes 0, 1, 3, 2, 1 and 3 in that order.
+ */
+static void packed422_to_packed444_scanline_c( uint8_t *output, uint8_t *input, int width )
+{
+    const int pairs = width / 2;
+    int n;
+
+    SPEEDY_START();
+    for( n = 0; n < pairs; n++ ) {
+        uint8_t *dst = output + (n * 6);
+        uint8_t *src = input + (n * 4);
+
+        dst[ 0 ] = src[ 0 ];
+        dst[ 1 ] = src[ 1 ];
+        dst[ 2 ] = src[ 3 ];
+        dst[ 3 ] = src[ 2 ];
+        dst[ 4 ] = src[ 1 ];
+        dst[ 5 ] = src[ 3 ];
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Expands a packed 4:2:2 scanline ([Y'][Cb][Y'][Cr]) to three bytes per
+ * pixel, interpolating the chroma of every second output pixel.
+ *
+ * For the middle pixels, the filter kernel is:
+ *
+ * [-1 3 -6 12 -24 80 80 -24 12 -6 3 -1]
+ *
+ * Near the edges, where the kernel would not fit, the chroma is the
+ * rounded average of the two neighbouring samples, and the very last pair
+ * simply repeats its own chroma.
+ */
+void packed422_to_packed444_rec601_scanline( uint8_t *dest, uint8_t *src, int width )
+{
+    const int pairs = width / 2;
+    int n;
+
+    SPEEDY_START();
+    /* Process two input pixels at a time. Input is [Y'][Cb][Y'][Cr]. */
+    for( n = 0; n < pairs; n++ ) {
+        uint8_t *d = dest + (n*6);
+        uint8_t *s = src + (n*4);
+
+        /* First pixel of the pair keeps the co-sited chroma. */
+        d[ 0 ] = s[ 0 ];
+        d[ 1 ] = s[ 1 ];
+        d[ 2 ] = s[ 3 ];
+
+        d[ 3 ] = s[ 2 ];
+        if( n > (5*2) && n < (pairs - (6*2)) ) {
+            /* Same-type chroma samples sit four bytes apart. */
+            d[ 4 ] = clip255( (( (80*(s[ 1 ] + s[ 5 ]))
+                               - (24*(s[ -3 ] + s[ 9 ]))
+                               + (12*(s[ -7 ] + s[ 13 ]))
+                               - ( 6*(s[ -11 ] + s[ 17 ]))
+                               + ( 3*(s[ -15 ] + s[ 21 ]))
+                               - (   (s[ -19 ] + s[ 25 ]))) + 64) >> 7 );
+            d[ 5 ] = clip255( (( (80*(s[ 3 ] + s[ 7 ]))
+                               - (24*(s[ -1 ] + s[ 11 ]))
+                               + (12*(s[ -5 ] + s[ 15 ]))
+                               - ( 6*(s[ -9 ] + s[ 19 ]))
+                               + ( 3*(s[ -13 ] + s[ 23 ]))
+                               - (   (s[ -17 ] + s[ 27 ]))) + 64) >> 7 );
+        } else if( n < (pairs - 1) ) {
+            d[ 4 ] = (s[ 1 ] + s[ 5 ] + 1) >> 1;
+            d[ 5 ] = (s[ 3 ] + s[ 7 ] + 1) >> 1;
+        } else {
+            d[ 4 ] = s[ 1 ];
+            d[ 5 ] = s[ 3 ];
+        }
+    }
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version: overwrites the second byte of every 2-byte sample pair in
+ * the scanline with 128, leaving the first byte (luma) untouched.
+ *
+ * Four pixels are handled per MMX iteration while more than four remain;
+ * the last one to four pixels are finished with the scalar loop.
+ */
+static void kill_chroma_packed422_inplace_scanline_mmx( uint8_t *data, int width )
+{
+ const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
+ const mmx_t nullchroma = { 0x8000800080008000ULL };
+
+ SPEEDY_START();
+
+ movq_m2r( ymask, mm7 );
+ movq_m2r( nullchroma, mm6 );
+ for(; width > 4; width -= 4 ) {
+ movq_m2r( *data, mm0 );
+ /* Clear the chroma bytes, then add 0x80 into them. */
+ pand_r2r( mm7, mm0 );
+ paddb_r2r( mm6, mm0 );
+ movq_r2m( mm0, *data );
+ data += 8;
+ }
+ emms();
+
+ /* Scalar tail for the remaining pixels. */
+ while( width-- ) {
+ data[ 1 ] = 128;
+ data += 2;
+ }
+ SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Portable version: sets the second byte of each of the width 2-byte
+ * sample pairs to 128, leaving the first byte (luma) untouched.
+ */
+static void kill_chroma_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    int px;
+
+    SPEEDY_START();
+    for( px = 0; px < width; px++ ) {
+        data[ (px * 2) + 1 ] = 128;
+    }
+    SPEEDY_END();
+}
+
+/*
+// this duplicates alternate lines in alternate frames to highlight or mute
+// the effects of chroma crawl. it is not a solution or proper filter. it's
+// only for testing.
+static void testing_packed422_inplace_scanline_c( uint8_t *data, int width, int scanline )
+{
+ volatile static int topbottom = 0;
+ static uint8_t scanbuffer[2048];
+
+ SPEEDY_START();
+ if( scanline <= 1 ) {
+ topbottom = scanline;
+ memcpy(scanbuffer, data, width*2);
+ }
+ if ( scanline < 10 ) {
+ printf("scanline: %d %d\n", scanline, topbottom);
+ }
+ if ( ((scanline-topbottom)/2)%2 && scanline > 1 ) {
+ memcpy(data, scanbuffer, width*2);
+ } else {
+ memcpy(scanbuffer, data, width*2);
+ }
+ SPEEDY_END();
+}
+*/
+
+/**
+ * Mirrors a packed 4:2:2 scanline in place by reversing the order of its
+ * width 2-byte sample pairs.
+ *
+ * The mirror of the pair at byte offset x is the pair at (width*2 - 2 - x).
+ * The previous code used (width*2 - x), which on its first iteration read
+ * and wrote two bytes past the end of the scanline and left the swap
+ * shifted by one pair.
+ *
+ * NOTE(review): pairs are moved as whole [luma][chroma] units, so the
+ * Cb/Cr alternation is reversed along with the pixels -- confirm that
+ * callers accept this.
+ */
+static void mirror_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    int left, right;
+
+    SPEEDY_START();
+    /* Walk inwards from both ends, swapping one sample pair at a time. */
+    for( left = 0, right = (width*2) - 2; left < right; left += 2, right -= 2 ) {
+        const uint8_t first = data[ left ];
+        const uint8_t second = data[ left + 1 ];
+
+        data[ left ] = data[ right ];
+        data[ left + 1 ] = data[ right + 1 ];
+        data[ right ] = first;
+        data[ right + 1 ] = second;
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Reflects the first half of the scanline onto the second half, in place.
+ * For each even offset below width, the two bytes at (width - offset) are
+ * copied to (width + offset).
+ */
+static void halfmirror_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    uint8_t *right = data + width;
+    int off;
+
+    SPEEDY_START();
+    for( off = 0; off < width; off += 2 ) {
+        right[ off ] = data[ width - off ];
+        right[ off + 1 ] = data[ width - off + 1 ];
+    }
+    SPEEDY_END();
+}
+
+/**
+ * In-place [1 2 1]/4 low-pass over the luma of a packed 4:2:2 scanline.
+ *
+ * The filter runs as two cascaded two-tap sums whose history starts at
+ * zero.  Reading begins at the second luma sample and each result is
+ * written one sample behind the read position, so width - 1 samples are
+ * written in total.
+ */
+static void filter_luma_121_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    uint8_t *cur = data + 2;
+    int remaining = width - 1;
+    int prev_sample = 0;
+    int prev_pair = 0;
+
+    SPEEDY_START();
+    while( remaining-- ) {
+        const int pair = *cur + prev_sample;
+        const int triple = pair + prev_pair;
+
+        prev_sample = *cur;
+        prev_pair = pair;
+        cur[ -2 ] = triple >> 2;
+        cur += 2;
+    }
+    SPEEDY_END();
+}
+
+/**
+ * In-place [1 4 6 4 1]/16 low-pass over the luma of a packed 4:2:2
+ * scanline.
+ *
+ * The filter runs as four cascaded two-tap sums whose history starts at
+ * zero.  Reading begins at the third luma sample and each result is
+ * written two samples behind the read position; width - 4 samples are
+ * processed.
+ */
+static void filter_luma_14641_packed422_inplace_scanline_c( uint8_t *data, int width )
+{
+    uint8_t *cur = data + 4;
+    int remaining = width - 4;
+    int hist1 = 0;
+    int hist2 = 0;
+    int hist3 = 0;
+    int hist4 = 0;
+
+    SPEEDY_START();
+    while( remaining-- ) {
+        const int sum1 = *cur + hist1;
+        const int sum2 = sum1 + hist2;
+        const int sum3 = sum2 + hist3;
+        const int sum4 = sum3 + hist4;
+
+        hist1 = *cur;
+        hist2 = sum1;
+        hist3 = sum2;
+        hist4 = sum3;
+        cur[ -4 ] = sum4 >> 4;
+        cur += 2;
+    }
+    SPEEDY_END();
+}
+
+/**
+ * Portable version: writes the truncated byte-wise average of the top and
+ * bot scanlines to output, for width pixels (width*2 bytes).
+ */
+static void interpolate_packed422_scanline_c( uint8_t *output, uint8_t *top,
+                                              uint8_t *bot, int width )
+{
+    const int nbytes = width * 2;
+    int n;
+
+    SPEEDY_START();
+
+    for( n = 0; n < nbytes; n++ ) {
+        output[ n ] = (top[ n ] + bot[ n ]) >> 1;
+    }
+
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version: writes the byte-wise average of the top and bot scanlines
+ * to output, for width pixels (width*2 bytes).
+ *
+ * The MMX loops compute (a >> 1) + (b >> 1) per byte; the scalar tail
+ * computes (a + b) >> 1.  Sixteen pixels are handled per iteration of the
+ * first loop, four per iteration of the second, and the last zero to three
+ * pixels by the scalar tail.
+ *
+ * The tail previously used (width & 0x7) as its pixel count, which
+ * re-counted the pixels already consumed by the four-pixel loop and ran up
+ * to four pixels past the end of all three buffers.
+ */
+static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
+                                                uint8_t *bot, int width )
+{
+    const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */
+    int i;
+
+    SPEEDY_START();
+
+    for( i = width/16; i; --i ) {
+        movq_m2r( *bot, mm0 );
+        movq_m2r( *top, mm1 );
+        movq_m2r( *(bot + 8), mm2 );
+        movq_m2r( *(top + 8), mm3 );
+        movq_m2r( *(bot + 16), mm4 );
+        movq_m2r( *(top + 16), mm5 );
+        movq_m2r( *(bot + 24), mm6 );
+        movq_m2r( *(top + 24), mm7 );
+        pand_m2r( shiftmask, mm0 );
+        pand_m2r( shiftmask, mm1 );
+        pand_m2r( shiftmask, mm2 );
+        pand_m2r( shiftmask, mm3 );
+        pand_m2r( shiftmask, mm4 );
+        pand_m2r( shiftmask, mm5 );
+        pand_m2r( shiftmask, mm6 );
+        pand_m2r( shiftmask, mm7 );
+        psrlw_i2r( 1, mm0 );
+        psrlw_i2r( 1, mm1 );
+        psrlw_i2r( 1, mm2 );
+        psrlw_i2r( 1, mm3 );
+        psrlw_i2r( 1, mm4 );
+        psrlw_i2r( 1, mm5 );
+        psrlw_i2r( 1, mm6 );
+        psrlw_i2r( 1, mm7 );
+        paddb_r2r( mm1, mm0 );
+        paddb_r2r( mm3, mm2 );
+        paddb_r2r( mm5, mm4 );
+        paddb_r2r( mm7, mm6 );
+        movq_r2m( mm0, *output );
+        movq_r2m( mm2, *(output + 8) );
+        movq_r2m( mm4, *(output + 16) );
+        movq_r2m( mm6, *(output + 24) );
+        output += 32;
+        top += 32;
+        bot += 32;
+    }
+    width = (width & 0xf);
+
+    for( i = width/4; i; --i ) {
+        movq_m2r( *bot, mm0 );
+        movq_m2r( *top, mm1 );
+        pand_m2r( shiftmask, mm0 );
+        pand_m2r( shiftmask, mm1 );
+        psrlw_i2r( 1, mm0 );
+        psrlw_i2r( 1, mm1 );
+        paddb_r2r( mm1, mm0 );
+        movq_r2m( mm0, *output );
+        output += 8;
+        top += 8;
+        bot += 8;
+    }
+    /* The loop above consumed whole groups of four, so 0..3 pixels remain. */
+    width = width & 0x3;
+
+    /* Handle last few pixels. */
+    for( i = width * 2; i; --i ) {
+        *output++ = ((*top++) + (*bot++)) >> 1;
+    }
+
+    emms();
+
+    SPEEDY_END();
+}
+
+/**
+ * MMXEXT version: writes the byte-wise average of the top and bot
+ * scanlines to output, for width pixels (width*2 bytes).
+ *
+ * The MMX loops average with pavgb and store with movntq; the scalar tail
+ * computes (a + b) >> 1.  Sixteen pixels are handled per iteration of the
+ * first loop, four per iteration of the second, and the last zero to three
+ * pixels by the scalar tail.
+ *
+ * The tail previously used (width & 0x7) as its pixel count, which
+ * re-counted the pixels already consumed by the four-pixel loop and ran up
+ * to four pixels past the end of all three buffers.
+ */
+static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top,
+                                                   uint8_t *bot, int width )
+{
+    int i;
+
+    SPEEDY_START();
+
+    for( i = width/16; i; --i ) {
+        movq_m2r( *bot, mm0 );
+        movq_m2r( *top, mm1 );
+        movq_m2r( *(bot + 8), mm2 );
+        movq_m2r( *(top + 8), mm3 );
+        movq_m2r( *(bot + 16), mm4 );
+        movq_m2r( *(top + 16), mm5 );
+        movq_m2r( *(bot + 24), mm6 );
+        movq_m2r( *(top + 24), mm7 );
+        pavgb_r2r( mm1, mm0 );
+        pavgb_r2r( mm3, mm2 );
+        pavgb_r2r( mm5, mm4 );
+        pavgb_r2r( mm7, mm6 );
+        movntq_r2m( mm0, *output );
+        movntq_r2m( mm2, *(output + 8) );
+        movntq_r2m( mm4, *(output + 16) );
+        movntq_r2m( mm6, *(output + 24) );
+        output += 32;
+        top += 32;
+        bot += 32;
+    }
+    width = (width & 0xf);
+
+    for( i = width/4; i; --i ) {
+        movq_m2r( *bot, mm0 );
+        movq_m2r( *top, mm1 );
+        pavgb_r2r( mm1, mm0 );
+        movntq_r2m( mm0, *output );
+        output += 8;
+        top += 8;
+        bot += 8;
+    }
+    /* The loop above consumed whole groups of four, so 0..3 pixels remain. */
+    width = width & 0x3;
+
+    /* Handle last few pixels. */
+    for( i = width * 2; i; --i ) {
+        *output++ = ((*top++) + (*bot++)) >> 1;
+    }
+
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Portable version: fills a packed 4:2:2 scanline of width pixels with a
+ * single colour, writing the byte sequence [y][cb][y][cr] for every pixel
+ * pair.
+ *
+ * The bytes are stored one at a time.  The previous code built a 32-bit
+ * word with (cr << 24), which overflows a signed int for cr >= 128, and
+ * stored it through a cast pointer, which depended on the host byte order
+ * and on output being 4-byte aligned.  An odd trailing pixel is now
+ * written as [y][cb], as the MMX versions of this routine do.
+ */
+static void blit_colour_packed422_scanline_c( uint8_t *output, int width, int y, int cb, int cr )
+{
+    int pairs;
+
+    SPEEDY_START();
+
+    for( pairs = width / 2; pairs > 0; --pairs ) {
+        output[ 0 ] = y;
+        output[ 1 ] = cb;
+        output[ 2 ] = y;
+        output[ 3 ] = cr;
+        output += 4;
+    }
+
+    if( width > 0 && (width & 1) ) {
+        output[ 0 ] = y;
+        output[ 1 ] = cb;
+    }
+
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version: fills a packed 4:2:2 scanline of width pixels with a single
+ * colour.
+ *
+ * Sixteen pixels are stored per iteration of the first loop, four per
+ * iteration of the second, then at most one pixel pair and at most one
+ * single pixel.
+ *
+ * The count left after the four-pixel loop previously used (width & 0x7),
+ * which re-counted the pixels that loop had already written and could
+ * store up to four pixels past the end of the scanline.
+ */
+static void blit_colour_packed422_scanline_mmx( uint8_t *output, int width, int y, int cb, int cr )
+{
+    uint32_t colour = cr << 24 | y << 16 | cb << 8 | y;
+    int i;
+
+    SPEEDY_START();
+
+    /* Replicate the 32-bit pixel pair into both halves of mm2. */
+    movd_m2r( colour, mm1 );
+    movd_m2r( colour, mm2 );
+    psllq_i2r( 32, mm1 );
+    por_r2r( mm1, mm2 );
+
+    for( i = width / 16; i; --i ) {
+        movq_r2m( mm2, *output );
+        movq_r2m( mm2, *(output + 8) );
+        movq_r2m( mm2, *(output + 16) );
+        movq_r2m( mm2, *(output + 24) );
+        output += 32;
+    }
+    width = (width & 0xf);
+
+    for( i = width / 4; i; --i ) {
+        movq_r2m( mm2, *output );
+        output += 8;
+    }
+    /* The loop above consumed whole groups of four, so 0..3 pixels remain. */
+    width = (width & 0x3);
+
+    for( i = width / 2; i; --i ) {
+        *((uint32_t *) output) = colour;
+        output += 4;
+    }
+
+    if( width & 1 ) {
+        *output = y;
+        *(output + 1) = cb;
+    }
+
+    emms();
+
+    SPEEDY_END();
+}
+
+/**
+ * MMXEXT version: fills a packed 4:2:2 scanline of width pixels with a
+ * single colour, using movntq for the 8-byte stores.
+ *
+ * Sixteen pixels are stored per iteration of the first loop, four per
+ * iteration of the second, then at most one pixel pair and at most one
+ * single pixel.
+ *
+ * The count left after the four-pixel loop previously used (width & 0x7),
+ * which re-counted the pixels that loop had already written and could
+ * store up to four pixels past the end of the scanline.
+ */
+static void blit_colour_packed422_scanline_mmxext( uint8_t *output, int width, int y, int cb, int cr )
+{
+    uint32_t colour = cr << 24 | y << 16 | cb << 8 | y;
+    int i;
+
+    SPEEDY_START();
+
+    /* Replicate the 32-bit pixel pair into both halves of mm2. */
+    movd_m2r( colour, mm1 );
+    movd_m2r( colour, mm2 );
+    psllq_i2r( 32, mm1 );
+    por_r2r( mm1, mm2 );
+
+    for( i = width / 16; i; --i ) {
+        movntq_r2m( mm2, *output );
+        movntq_r2m( mm2, *(output + 8) );
+        movntq_r2m( mm2, *(output + 16) );
+        movntq_r2m( mm2, *(output + 24) );
+        output += 32;
+    }
+    width = (width & 0xf);
+
+    for( i = width / 4; i; --i ) {
+        movntq_r2m( mm2, *output );
+        output += 8;
+    }
+    /* The loop above consumed whole groups of four, so 0..3 pixels remain. */
+    width = (width & 0x3);
+
+    for( i = width / 2; i; --i ) {
+        *((uint32_t *) output) = colour;
+        output += 4;
+    }
+
+    if( width & 1 ) {
+        *output = y;
+        *(output + 1) = cb;
+    }
+
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Portable version: fills width four-byte pixels with the byte sequence
+ * [alpha][luma][cb][cr].
+ */
+static void blit_colour_packed4444_scanline_c( uint8_t *output, int width,
+                                               int alpha, int luma, int cb, int cr )
+{
+    int px;
+
+    SPEEDY_START();
+
+    for( px = 0; px < width; px++ ) {
+        uint8_t *pixel = output + (px * 4);
+
+        pixel[ 0 ] = alpha;
+        pixel[ 1 ] = luma;
+        pixel[ 2 ] = cb;
+        pixel[ 3 ] = cr;
+    }
+
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMX version: fills width four-byte pixels with one colour.  The pixel is
+ * packed into a 32-bit word with alpha in the lowest byte and cr in the
+ * highest.
+ *
+ * Eight pixels are stored per iteration of the first loop, two per
+ * iteration of the second, and a final odd pixel with a 32-bit store.
+ */
+static void blit_colour_packed4444_scanline_mmx( uint8_t *output, int width,
+ int alpha, int luma,
+ int cb, int cr )
+{
+ uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha;
+ int i;
+
+ SPEEDY_START();
+
+ /* Replicate the 32-bit pixel into both halves of mm2. */
+ movd_m2r( colour, mm1 );
+ movd_m2r( colour, mm2 );
+ psllq_i2r( 32, mm1 );
+ por_r2r( mm1, mm2 );
+
+ for( i = width / 8; i; --i ) {
+ movq_r2m( mm2, *output );
+ movq_r2m( mm2, *(output + 8) );
+ movq_r2m( mm2, *(output + 16) );
+ movq_r2m( mm2, *(output + 24) );
+ output += 32;
+ }
+ width = (width & 0x7);
+
+ for( i = width / 2; i; --i ) {
+ movq_r2m( mm2, *output );
+ output += 8;
+ }
+ width = (width & 0x1);
+
+ /* At most one pixel is left. */
+ if( width ) {
+ *((uint32_t *) output) = colour;
+ output += 4;
+ }
+
+ emms();
+
+ SPEEDY_END();
+}
+
+/**
+ * MMXEXT version: fills width four-byte pixels with one colour, using
+ * movntq for the 8-byte stores.  The pixel is packed into a 32-bit word
+ * with alpha in the lowest byte and cr in the highest.
+ *
+ * Eight pixels are stored per iteration of the first loop, two per
+ * iteration of the second, and a final odd pixel with a 32-bit store.
+ *
+ * NOTE(review): unlike the other implementations in this file this
+ * function is not declared static -- confirm whether that is intentional.
+ */
+void blit_colour_packed4444_scanline_mmxext( uint8_t *output, int width,
+ int alpha, int luma,
+ int cb, int cr )
+{
+ uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha;
+ int i;
+
+ SPEEDY_START();
+
+ /* Replicate the 32-bit pixel into both halves of mm2. */
+ movd_m2r( colour, mm1 );
+ movd_m2r( colour, mm2 );
+ psllq_i2r( 32, mm1 );
+ por_r2r( mm1, mm2 );
+
+ for( i = width / 8; i; --i ) {
+ movntq_r2m( mm2, *output );
+ movntq_r2m( mm2, *(output + 8) );
+ movntq_r2m( mm2, *(output + 16) );
+ movntq_r2m( mm2, *(output + 24) );
+ output += 32;
+ }
+ width = (width & 0x7);
+
+ for( i = width / 2; i; --i ) {
+ movntq_r2m( mm2, *output );
+ output += 8;
+ }
+ width = (width & 0x1);
+
+ /* At most one pixel is left. */
+ if( width ) {
+ *((uint32_t *) output) = colour;
+ output += 4;
+ }
+
+ sfence();
+ emms();
+
+ SPEEDY_END();
+}
+
+#endif
+
+/* Copies one packed 4:2:2 scanline: width pixels at two bytes per pixel. */
+static void blit_packed422_scanline_c( uint8_t *dest, const uint8_t *src, int width )
+{
+    const int nbytes = width * 2;
+
+    xine_fast_memcpy( dest, src, nbytes );
+}
+
+/* MMX slot for the scanline copy; it forwards to xine_fast_memcpy like the C version. */
+static void blit_packed422_scanline_mmx( uint8_t *dest, const uint8_t *src, int width )
+{
+    const int nbytes = width * 2;
+
+    xine_fast_memcpy( dest, src, nbytes );
+}
+
+/* MMXEXT slot for the scanline copy; it forwards to xine_fast_memcpy like the C version. */
+static void blit_packed422_scanline_mmxext( uint8_t *dest, const uint8_t *src, int width )
+{
+    const int nbytes = width * 2;
+
+    xine_fast_memcpy( dest, src, nbytes );
+}
+
+/**
+ * Portable version: composites a 4-byte-per-pixel foreground
+ * ([a][y][cb][cr]) over a packed 4:2:2 background with an additional
+ * global alpha, writing the result to output.
+ *
+ * For each pixel the effective alpha is (af*alpha + 0x80) >> 8:
+ *   - zero (or af == 0): the output pixel is left untouched;
+ *   - 0xff: the foreground sample replaces the output;
+ *   - otherwise: output = B + ((alpha*(F - af*B) + 0x80) >> 8), i.e.
+ *
+ *       (1 - af*a)*B + af*a*F  =  B + a*(af*F - af*B)
+ *
+ *     with F taken as already multiplied by af.
+ *
+ * Chroma is only written for even pixels, since each pixel pair shares
+ * one Cb and one Cr sample.
+ */
+static void composite_packed4444_alpha_to_packed422_scanline_c( uint8_t *output, uint8_t *input,
+                                                                uint8_t *foreground, int width, int alpha )
+{
+    int px;
+
+    SPEEDY_START();
+    for( px = 0; px < width; px++, foreground += 4, output += 2, input += 2 ) {
+        const int af = foreground[ 0 ];
+        const int even_pixel = ( ( px & 1 ) == 0 );
+        int a;
+
+        if( !af ) {
+            continue;
+        }
+
+        a = ((af * alpha) + 0x80) >> 8;
+        if( !a ) {
+            continue;
+        }
+
+        if( a == 0xff ) {
+            /* Fully opaque: plain copy of the foreground sample. */
+            output[ 0 ] = foreground[ 1 ];
+
+            if( even_pixel ) {
+                output[ 1 ] = foreground[ 2 ];
+                output[ 3 ] = foreground[ 3 ];
+            }
+            continue;
+        }
+
+        output[ 0 ] = input[ 0 ]
+                      + ((alpha*( foreground[ 1 ]
+                                  - multiply_alpha( af, input[ 0 ] ) ) + 0x80) >> 8);
+
+        if( even_pixel ) {
+            /*
+             * The same formula holds for chroma: the 128 offsets cancel,
+             *   C_r = 128 + ((1 - af*a)*(B - 128) + a*af*(F - 128))
+             *       = B - af*a*B + a*af*F
+             */
+            output[ 1 ] = input[ 1 ] + ((alpha*( foreground[ 2 ]
+                                                 - multiply_alpha( af, input[ 1 ] ) ) + 0x80) >> 8);
+            output[ 3 ] = input[ 3 ] + ((alpha*( foreground[ 3 ]
+                                                 - multiply_alpha( af, input[ 3 ] ) ) + 0x80) >> 8);
+        }
+    }
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMXEXT version: composites a 4-byte-per-pixel foreground
+ * ([a][y][cb][cr]) over a packed 4:2:2 background with an additional
+ * global alpha, two pixels (one 4-byte 4:2:2 pair) per iteration.
+ *
+ * alpha == 0 degenerates to a plain copy of the background and
+ * alpha == 256 to the compositor without global alpha; both are forwarded
+ * through the corresponding function pointers.  Pixel pairs whose two
+ * foreground words are both zero are skipped, leaving output untouched.
+ *
+ * The global alpha is loaded with movd.  It was previously loaded with
+ * movq, an 8-byte read from the 4-byte int parameter; only the low word
+ * is used, because pshufw then broadcasts it to all four lanes.
+ */
+static void composite_packed4444_alpha_to_packed422_scanline_mmxext( uint8_t *output,
+                                                                     uint8_t *input,
+                                                                     uint8_t *foreground,
+                                                                     int width, int alpha )
+{
+    const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
+    const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
+    const mmx_t round = { 0x0080008000800080ULL };
+    int i;
+
+    if( !alpha ) {
+        blit_packed422_scanline( output, input, width );
+        return;
+    }
+
+    if( alpha == 256 ) {
+        composite_packed4444_to_packed422_scanline( output, input, foreground, width );
+        return;
+    }
+
+    SPEEDY_START();
+    READ_PREFETCH_2048( input );
+    READ_PREFETCH_2048( foreground );
+
+    /* mm2 = global alpha in all four 16-bit lanes. */
+    movd_m2r( alpha, mm2 );
+    pshufw_r2r( mm2, mm2, 0 );
+    pxor_r2r( mm7, mm7 );
+
+    for( i = width/2; i; i-- ) {
+        int fg1 = *((uint32_t *) foreground);
+        int fg2 = *(((uint32_t *) foreground)+1);
+
+        if( fg1 || fg2 ) {
+            /* mm1 = [ cr ][ y ][ cb ][ y ] */
+            movd_m2r( *input, mm1 );
+            punpcklbw_r2r( mm7, mm1 );
+
+            movq_m2r( *foreground, mm3 );
+            movq_r2r( mm3, mm4 );
+            punpcklbw_r2r( mm7, mm3 );
+            punpckhbw_r2r( mm7, mm4 );
+            /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */
+
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */
+            pshufw_r2r( mm3, mm5, 0 );
+            pshufw_r2r( mm4, mm6, 0 );
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001001 == 201 */
+            pshufw_r2r( mm3, mm3, 201 );
+            /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */
+            pshufw_r2r( mm4, mm4, 16 );
+
+            /* Merge: lanes 0, 1 and 3 from the first pixel, lane 2 from the second. */
+            pand_m2r( alpha1, mm3 );
+            pand_m2r( alpha2, mm4 );
+            pand_m2r( alpha1, mm5 );
+            pand_m2r( alpha2, mm6 );
+            por_r2r( mm4, mm3 );
+            por_r2r( mm6, mm5 );
+
+            /* now, mm5 is af and mm1 is B. Need to multiply them. */
+            pmullw_r2r( mm1, mm5 );
+
+            /* Multiply by appalpha. */
+            pmullw_r2r( mm2, mm3 );
+            paddw_m2r( round, mm3 );
+            psrlw_i2r( 8, mm3 );
+            /* Result is now B + F. */
+            paddw_r2r( mm3, mm1 );
+
+            /* Round up appropriately. */
+            paddw_m2r( round, mm5 );
+
+            /* mm6 contains our i>>8; */
+            movq_r2r( mm5, mm6 );
+            psrlw_i2r( 8, mm6 );
+
+            /* Add mm6 back into mm5. Now our result is in the high bytes. */
+            paddw_r2r( mm6, mm5 );
+
+            /* Shift down. */
+            psrlw_i2r( 8, mm5 );
+
+            /* Multiply by appalpha. */
+            pmullw_r2r( mm2, mm5 );
+            paddw_m2r( round, mm5 );
+            psrlw_i2r( 8, mm5 );
+
+            psubusw_r2r( mm5, mm1 );
+
+            /* mm1 = [ B + F - af*B ] */
+            packuswb_r2r( mm1, mm1 );
+            movd_r2m( mm1, *output );
+        }
+
+        foreground += 8;
+        output += 4;
+        input += 4;
+    }
+    sfence();
+    emms();
+
+    SPEEDY_END();
+}
+
+#endif
+
+/**
+ * Portable version: composites a 4-byte-per-pixel foreground
+ * ([a][y][cb][cr]) over a packed 4:2:2 background, writing to output.
+ *
+ * Per pixel, with af the foreground alpha:
+ *   - af == 0:    the output pixel is left untouched;
+ *   - af == 0xff: the foreground sample replaces the output;
+ *   - otherwise:  output = B + F - af*B, i.e. (1 - af)*B + af*F with F
+ *                 taken as already multiplied by af.
+ *
+ * Chroma is only written for even pixels, since each pixel pair shares
+ * one Cb and one Cr sample.
+ */
+static void composite_packed4444_to_packed422_scanline_c( uint8_t *output, uint8_t *input,
+                                                          uint8_t *foreground, int width )
+{
+    int px;
+
+    SPEEDY_START();
+    for( px = 0; px < width; px++, foreground += 4, output += 2, input += 2 ) {
+        const int af = foreground[ 0 ];
+        const int even_pixel = ( ( px & 1 ) == 0 );
+
+        if( !af ) {
+            continue;
+        }
+
+        if( af == 0xff ) {
+            output[ 0 ] = foreground[ 1 ];
+
+            if( even_pixel ) {
+                output[ 1 ] = foreground[ 2 ];
+                output[ 3 ] = foreground[ 3 ];
+            }
+            continue;
+        }
+
+        output[ 0 ] = input[ 0 ] + foreground[ 1 ] - multiply_alpha( af, input[ 0 ] );
+
+        if( even_pixel ) {
+            /* C_r = (1 - af)*B + af*F = B - af*B + af*F */
+            output[ 1 ] = input[ 1 ] + foreground[ 2 ] - multiply_alpha( af, input[ 1 ] );
+            output[ 3 ] = input[ 3 ] + foreground[ 3 ] - multiply_alpha( af, input[ 3 ] );
+        }
+    }
+    SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+/**
+ * MMXEXT version: composites a 4-byte-per-pixel foreground
+ * ([a][y][cb][cr]) over a packed 4:2:2 background, two pixels (one 4-byte
+ * 4:2:2 pair) per iteration.
+ *
+ * Three cases per pixel pair:
+ *   - both foreground alphas are 0xff: the foreground colours are packed
+ *     straight into the output;
+ *   - both foreground words are zero: the output is left untouched;
+ *   - otherwise: output = B + F - af*B, with af*B computed using the same
+ *     rounding as multiply_alpha().
+ */
+static void composite_packed4444_to_packed422_scanline_mmxext( uint8_t *output, uint8_t *input,
+ uint8_t *foreground, int width )
+{
+ const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
+ const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
+ const mmx_t round = { 0x0080008000800080ULL };
+ int i;
+
+ SPEEDY_START();
+ READ_PREFETCH_2048( input );
+ READ_PREFETCH_2048( foreground );
+
+ pxor_r2r( mm7, mm7 );
+ for( i = width/2; i; i-- ) {
+ int fg1 = *((uint32_t *) foreground);
+ int fg2 = *(((uint32_t *) foreground)+1);
+
+ if( (fg1 & 0xff) == 0xff && (fg2 & 0xff) == 0xff ) {
+ /* Both pixels fully opaque: just repack the foreground colours. */
+ movq_m2r( *foreground, mm3 );
+ movq_r2r( mm3, mm4 );
+ punpcklbw_r2r( mm7, mm3 );
+ punpckhbw_r2r( mm7, mm4 );
+ /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001001 == 201 */
+ pshufw_r2r( mm3, mm3, 201 );
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */
+ pshufw_r2r( mm4, mm4, 16 );
+ pand_m2r( alpha1, mm3 );
+ pand_m2r( alpha2, mm4 );
+ por_r2r( mm4, mm3 );
+ /* mm3 = [ cr ][ y ][ cb ][ y ] of the foreground */
+ packuswb_r2r( mm3, mm3 );
+ movd_r2m( mm3, *output );
+ } else if( fg1 || fg2 ) {
+
+ /* mm1 = [ cr ][ y ][ cb ][ y ] */
+ movd_m2r( *input, mm1 );
+ punpcklbw_r2r( mm7, mm1 );
+
+ movq_m2r( *foreground, mm3 );
+ movq_r2r( mm3, mm4 );
+ punpcklbw_r2r( mm7, mm3 );
+ punpckhbw_r2r( mm7, mm4 );
+ /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */
+
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */
+ pshufw_r2r( mm3, mm5, 0 );
+ pshufw_r2r( mm4, mm6, 0 );
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001001 == 201 */
+ pshufw_r2r( mm3, mm3, 201 );
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */
+ pshufw_r2r( mm4, mm4, 16 );
+
+ /* Merge: lanes 0, 1 and 3 from the first pixel, lane 2 from the second. */
+ pand_m2r( alpha1, mm3 );
+ pand_m2r( alpha2, mm4 );
+ pand_m2r( alpha1, mm5 );
+ pand_m2r( alpha2, mm6 );
+ por_r2r( mm4, mm3 );
+ por_r2r( mm6, mm5 );
+
+ /* now, mm5 is af and mm1 is B. Need to multiply them. */
+ pmullw_r2r( mm1, mm5 );
+
+ /* Result is now B + F. */
+ paddw_r2r( mm3, mm1 );
+
+ /* Round up appropriately. */
+ paddw_m2r( round, mm5 );
+
+ /* mm6 contains our i>>8; */
+ movq_r2r( mm5, mm6 );
+ psrlw_i2r( 8, mm6 );
+
+ /* Add mm6 back into mm5. Now our result is in the high bytes. */
+ paddw_r2r( mm6, mm5 );
+
+ /* Shift down. */
+ psrlw_i2r( 8, mm5 );
+
+ psubusw_r2r( mm5, mm1 );
+
+ /* mm1 = [ B + F - af*B ] */
+ packuswb_r2r( mm1, mm1 );
+ movd_r2m( mm1, *output );
+ }
+
+ foreground += 8;
+ output += 4;
+ input += 4;
+ }
+ sfence();
+ emms();
+
+ SPEEDY_END();
+}
+
+#endif
+
+/**
+ * um... just need some scrap paper...
+ * D = (1 - alpha)*B + alpha*F
+ * D = (1 - a)*B + a*textluma
+ * = B - a*B + a*textluma
+ * = B + a*(textluma - B)
+ * Da = (1 - a)*b + a
+ */
+static void composite_alphamask_to_packed4444_scanline_c( uint8_t *output,
+ uint8_t *input,
+ uint8_t *mask,
+ int width,
+ int textluma, int textcb,
+ int textcr )
+{
+ uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;
+ int i;
+
+ SPEEDY_START();
+
+ for( i = 0; i < width; i++ ) {
+ int a = *mask;
+
+ if( a == 0xff ) {
+ *((uint32_t *) output) = opaque;
+ } else if( (input[ 0 ] == 0x00) ) {
+ *((uint32_t *) output) = (multiply_alpha( a, textcr ) << 24)
+ | (multiply_alpha( a, textcb ) << 16)
+ | (multiply_alpha( a, textluma ) << 8) | a;
+ } else if( a ) {
+ *((uint32_t *) output) = ((input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24)
+ | ((input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16)
+ | ((input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8)
+ | (input[ 0 ] + multiply_alpha( a, 0xff - input[ 0 ] ));
+ }
+ mask++;
+ output += 4;
+ input += 4;
+ }
+ SPEEDY_END();
+}
+
+#ifdef ARCH_X86
+
+ /**
+  * MMXEXT version of composite_alphamask_to_packed4444_scanline_c(): composites
+  * a solid text colour through an 8-bit alpha mask onto a packed 4:4:4:4
+  * scanline, one pixel per iteration.
+  *
+  * Registers that stay constant across the whole loop:
+  *   mm0 = [ cr ][ cb ][ y ][ 0xff ]   text colour with full alpha
+  *   mm1 = [ cr ][ cb ][ y ][ 0 ]      text colour with zero alpha
+  *   mm6 = rounding constant (0x0080 in every word)
+  *   mm7 = 0
+  * mm2-mm5 are per-pixel scratch.  mm6 must not be used as scratch: the
+  * blend branch adds it as the rounding term, so clobbering it in one pixel
+  * corrupts every blended pixel that follows.
+  *
+  * width is the number of pixels (one mask byte per pixel).
+  */
+ static void composite_alphamask_to_packed4444_scanline_mmxext( uint8_t *output,
+                                                                uint8_t *input,
+                                                                uint8_t *mask,
+                                                                int width,
+                                                                int textluma, int textcb,
+                                                                int textcr )
+ {
+     /* Pack in unsigned arithmetic: an int channel >= 0x80 shifted by 24 overflows. */
+     const uint32_t opaque = ((uint32_t) textcr << 24) | ((uint32_t) textcb << 16)
+                           | ((uint32_t) textluma << 8) | 0xff;
+     const mmx_t round = { 0x0080008000800080ULL };
+     const mmx_t fullalpha = { 0x00000000000000ffULL };
+     mmx_t colour;
+
+     SPEEDY_START();
+
+     colour.w[ 0 ] = 0x00;
+     colour.w[ 1 ] = textluma;
+     colour.w[ 2 ] = textcb;
+     colour.w[ 3 ] = textcr;
+
+     /* mm1 = [ cr ][ cb ][ y ][ 0 ] */
+     movq_m2r( colour, mm1 );
+     movq_r2r( mm1, mm0 );
+
+     /* mm0 = [ cr ][ cb ][ y ][ 0xff ] */
+     paddw_m2r( fullalpha, mm0 );
+
+     /* mm7 = 0 */
+     pxor_r2r( mm7, mm7 );
+
+     /* mm6 = round */
+     movq_m2r( round, mm6 );
+
+     while( width-- ) {
+         int a = *mask;
+
+         if( a == 0xff ) {
+             *((uint32_t *) output) = opaque;
+         } else if( (input[ 0 ] == 0x00) ) {
+             /* We just need to multiply our colour by the alpha value. */
+
+             /* mm2 = [ a ][ a ][ a ][ a ], mm3 = [ 0 ][ 0 ][ 0 ][ a ] */
+             movd_m2r( a, mm2 );
+             movq_r2r( mm2, mm3 );
+             pshufw_r2r( mm2, mm2, 0 );
+
+             /* mm5 = [ cr ][ cb ][ y ][ 0 ] */
+             movq_r2r( mm1, mm5 );
+
+             /*
+              * Multiply by alpha: mm5 = (t + (t >> 8)) >> 8 with t = c*a + 0x80.
+              * mm4 is the scratch register here so that mm6 keeps the
+              * rounding constant for the blend branch below.
+              */
+             pmullw_r2r( mm2, mm5 );
+             paddw_m2r( round, mm5 );
+             movq_r2r( mm5, mm4 );
+             psrlw_i2r( 8, mm4 );
+             paddw_r2r( mm4, mm5 );
+             psrlw_i2r( 8, mm5 );
+
+             /* Set alpha to a (the alpha word of mm5 is 0 at this point). */
+             por_r2r( mm3, mm5 );
+
+             /* Pack and write our result. */
+             packuswb_r2r( mm5, mm5 );
+             movd_r2m( mm5, *output );
+         } else if( a ) {
+             /* mm2 = [ a ][ a ][ a ][ a ] */
+             movd_m2r( a, mm2 );
+             pshufw_r2r( mm2, mm2, 0 );
+
+             /* mm3 = [ cr ][ cb ][ y ][ 0xff ] */
+             movq_r2r( mm0, mm3 );
+
+             /* mm4 = [ i_cr ][ i_cb ][ i_y ][ i_a ] */
+             movd_m2r( *input, mm4 );
+             punpcklbw_r2r( mm7, mm4 );
+
+             /* Subtract input and colour. */
+             psubw_r2r( mm4, mm3 ); /* mm3 = mm3 - mm4 */
+
+             /* Multiply alpha; mm6 is the rounding constant. */
+             pmullw_r2r( mm2, mm3 );
+             paddw_r2r( mm6, mm3 );
+             movq_r2r( mm3, mm2 );
+             psrlw_i2r( 8, mm3 );
+             paddw_r2r( mm2, mm3 );
+             psrlw_i2r( 8, mm3 );
+
+             /* Add back in the input (byte-wise, so only the low bytes matter). */
+             paddb_r2r( mm3, mm4 );
+
+             /* Write result. */
+             packuswb_r2r( mm4, mm4 );
+             movd_r2m( mm4, *output );
+         }
+         mask++;
+         output += 4;
+         input += 4;
+     }
+     sfence();
+     emms();
+     SPEEDY_END();
+ }
+
+#endif
+
+ /**
+  * Like composite_alphamask_to_packed4444_scanline_c(), but every mask value is
+  * first scaled by a global alpha: a = (mask * alpha + 0x80) >> 8.
+  * NOTE(review): that scaling only yields 0xff when alpha is 256, so alpha looks
+  * like a 0..256 factor -- confirm against callers.
+  *
+  * Input pixels are read as bytes [0] alpha, [1] luma, [2] cb, [3] cr; output
+  * pixels are stored as one 32-bit word (cr << 24 | cb << 16 | luma << 8 | alpha).
+  *
+  * Pixels whose mask byte is 0, or whose scaled alpha is 0 over a non-transparent
+  * input, are not written.  width is the number of pixels.
+  */
+ static void composite_alphamask_alpha_to_packed4444_scanline_c( uint8_t *output,
+                                                                 uint8_t *input,
+                                                                 uint8_t *mask, int width,
+                                                                 int textluma, int textcb,
+                                                                 int textcr, int alpha )
+ {
+     /*
+      * Pack in unsigned arithmetic: shifting an int channel value >= 0x80
+      * left by 24 does not fit in a signed int, which is undefined.
+      */
+     const uint32_t opaque = ((uint32_t) textcr << 24) | ((uint32_t) textcb << 16)
+                           | ((uint32_t) textluma << 8) | 0xff;
+     int i;
+
+     SPEEDY_START();
+
+     for( i = 0; i < width; i++ ) {
+         int af = *mask;
+
+         if( af ) {
+             /* Effective alpha: mask value scaled by the global alpha. */
+             int a = ((af * alpha) + 0x80) >> 8;
+
+             if( a == 0xff ) {
+                 *((uint32_t *) output) = opaque;
+             } else if( input[ 0 ] == 0x00 ) {
+                 /* Nothing underneath: just scale the text colour. */
+                 *((uint32_t *) output) = ((uint32_t) multiply_alpha( a, textcr ) << 24)
+                                        | ((uint32_t) multiply_alpha( a, textcb ) << 16)
+                                        | ((uint32_t) multiply_alpha( a, textluma ) << 8)
+                                        | (uint32_t) a;
+             } else if( a ) {
+                 /* D = B + a*(F - B) per channel, Da = a + (0xff - a)*b. */
+                 *((uint32_t *) output) =
+                       ((uint32_t) (input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24)
+                     | ((uint32_t) (input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16)
+                     | ((uint32_t) (input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8)
+                     | (uint32_t) (a + multiply_alpha( 0xff - a, input[ 0 ] ));
+             }
+         }
+         mask++;
+         output += 4;
+         input += 4;
+     }
+
+     SPEEDY_END();
+ }
+
+ /**
+  * Premultiplies a packed 4:4:4:4 scanline by its own alpha.
+  *
+  * For each of the width pixels, input bytes [1], [2] and [3] are scaled by
+  * input byte [0] via multiply_alpha(), and the result is stored as one 32-bit
+  * word: scaled [3] << 24 | scaled [2] << 16 | scaled [1] << 8 | alpha.
+  */
+ static void premultiply_packed4444_scanline_c( uint8_t *output, uint8_t *input, int width )
+ {
+     SPEEDY_START();
+
+     while( width-- ) {
+         unsigned int cur_a = input[ 0 ];
+
+         /*
+          * Convert each channel to uint32_t before shifting so that a value
+          * >= 0x80 moved into bits 24-31 cannot overflow a signed int.
+          * NOTE(review): assumes multiply_alpha() returns a signed int.
+          */
+         *((uint32_t *) output) = ((uint32_t) multiply_alpha( cur_a, input[ 3 ] ) << 24)
+                                | ((uint32_t) multiply_alpha( cur_a, input[ 2 ] ) << 16)
+                                | ((uint32_t) multiply_alpha( cur_a, input[ 1 ] ) << 8)
+                                | cur_a;
+
+         output += 4;
+         input += 4;
+     }
+
+     SPEEDY_END();
+ }
+
+#ifdef ARCH_X86
+
+ /**
+  * MMXEXT version of premultiply_packed4444_scanline_c(): scales the three
+  * colour bytes of each packed 4:4:4:4 pixel by the pixel's own alpha byte
+  * (byte 0) and stores the pixel with its alpha unchanged.  One pixel is
+  * processed per iteration; width is the number of pixels.
+  *
+  * Register comments assume each macro emits the like-named MMX instruction
+  * with (source, destination) operand order.
+  */
+ static void premultiply_packed4444_scanline_mmxext( uint8_t *output, uint8_t *input, int width )
+ {
+ /* 0x80 in every 16-bit word: rounding term for the divide by 255. */
+ const mmx_t round = { 0x0080008000800080ULL };
+ /* Keeps only the low byte of the low word (the alpha value). */
+ const mmx_t alpha = { 0x00000000000000ffULL };
+ /* Clears the low (alpha) word, keeps the three colour words. */
+ const mmx_t noalp = { 0xffffffffffff0000ULL };
+
+ SPEEDY_START();
+
+ /* mm7 = 0, used to widen bytes to words. */
+ pxor_r2r( mm7, mm7 );
+ while( width-- ) {
+ /* mm0 = pixel widened to four 16-bit words; word 0 is alpha. */
+ movd_m2r( *input, mm0 );
+ punpcklbw_r2r( mm7, mm0 );
+
+ /* mm2 = alpha broadcast to all four words, mm4 = alpha in word 0 only. */
+ movq_r2r( mm0, mm2 );
+ pshufw_r2r( mm2, mm2, 0 );
+ movq_r2r( mm2, mm4 );
+ pand_m2r( alpha, mm4 );
+
+ /* mm0 = channel * alpha + 0x80 */
+ pmullw_r2r( mm2, mm0 );
+ paddw_m2r( round, mm0 );
+
+ /* mm0 = (mm0 + (mm0 >> 8)) >> 8 */
+ movq_r2r( mm0, mm3 );
+ psrlw_i2r( 8, mm3 );
+ paddw_r2r( mm3, mm0 );
+ psrlw_i2r( 8, mm0 );
+
+ /* Drop the scaled alpha word and put the original alpha back. */
+ pand_m2r( noalp, mm0 );
+ paddw_r2r( mm4, mm0 );
+
+ /* Narrow back to bytes and store the pixel. */
+ packuswb_r2r( mm0, mm0 );
+ movd_r2m( mm0, *output );
+
+ output += 4;
+ input += 4;
+ }
+ sfence();
+ emms();
+
+ SPEEDY_END();
+ }
+
+#endif
+
+ /**
+  * Blends two packed 4:2:2 scanlines with weight pos out of 256:
+  *
+  *   output = (src1 * (256 - pos) + src2 * pos + 0x80) >> 8
+  *
+  * for each of the width * 2 bytes.  pos <= 0 copies src1 and pos >= 256
+  * copies src2 (the same clamping blend_packed422_scanline_mmxext() applies),
+  * so an out-of-range weight can no longer wrap when stored into a byte.
+  * pos == 128 is handed to interpolate_packed422_scanline().
+  */
+ static void blend_packed422_scanline_c( uint8_t *output, uint8_t *src1,
+                                         uint8_t *src2, int width, int pos )
+ {
+     if( pos <= 0 ) {
+         blit_packed422_scanline( output, src1, width );
+     } else if( pos >= 256 ) {
+         blit_packed422_scanline( output, src2, width );
+     } else if( pos == 128 ) {
+         interpolate_packed422_scanline( output, src1, src2, width );
+     } else {
+         /* Two bytes per pixel. */
+         width *= 2;
+         while( width-- ) {
+             *output++ = ( (*src1++ * ( 256 - pos )) + (*src2++ * pos) + 0x80 ) >> 8;
+         }
+     }
+ }
+
+#ifdef ARCH_X86
+
+ /**
+  * MMXEXT blend of two packed 4:2:2 scanlines with weight pos out of 256:
+  * output = (src1 * (256 - pos) + src2 * pos + 0x80) >> 8 per byte.
+  *
+  * pos <= 0 copies src1, pos >= 256 copies src2 and pos == 128 is handed to
+  * interpolate_packed422_scanline().  The general path handles four bytes
+  * (two pixels) per iteration and runs width / 2 times, so the last pixel of
+  * an odd width is not written.
+  *
+  * Register comments assume each macro emits the like-named MMX instruction
+  * with (source, destination) operand order.
+  */
+ static void blend_packed422_scanline_mmxext( uint8_t *output, uint8_t *src1,
+ uint8_t *src2, int width, int pos )
+ {
+ if( pos <= 0 ) {
+ blit_packed422_scanline( output, src1, width );
+ } else if( pos >= 256 ) {
+ blit_packed422_scanline( output, src2, width );
+ } else if( pos == 128 ) {
+ interpolate_packed422_scanline( output, src1, src2, width );
+ } else {
+ /* 256 and 0x80 in every 16-bit word. */
+ const mmx_t all256 = { 0x0100010001000100ULL };
+ const mmx_t round = { 0x0080008000800080ULL };
+
+ SPEEDY_START();
+
+ /* mm0 = pos in every word, mm1 = 256 - pos in every word, mm7 = 0. */
+ movd_m2r( pos, mm0 );
+ pshufw_r2r( mm0, mm0, 0 );
+ movq_m2r( all256, mm1 );
+ psubw_r2r( mm0, mm1 );
+ pxor_r2r( mm7, mm7 );
+
+ for( width /= 2; width; width-- ) {
+ /* Load four bytes from each source and widen them to words. */
+ movd_m2r( *src1, mm3 );
+ movd_m2r( *src2, mm4 );
+ punpcklbw_r2r( mm7, mm3 );
+ punpcklbw_r2r( mm7, mm4 );
+
+ /* mm3 = (src1 * (256 - pos) + src2 * pos + 0x80) >> 8 */
+ pmullw_r2r( mm1, mm3 );
+ pmullw_r2r( mm0, mm4 );
+ paddw_r2r( mm4, mm3 );
+ paddw_m2r( round, mm3 );
+ psrlw_i2r( 8, mm3 );
+
+ /* Narrow back to bytes and store four output bytes. */
+ packuswb_r2r( mm3, mm3 );
+ movd_r2m( mm3, *output );
+
+ output += 4;
+ src1 += 4;
+ src2 += 4;
+ }
+ sfence();
+ emms();
+
+ SPEEDY_END();
+ }
+ }
+
+ /**
+  * MMXEXT quarter-position vertical blit for packed 4:2:2 scanlines: each
+  * output byte is roughly (one + 3 * three) / 4.
+  *
+  * The vector paths average `three` into `one` twice with pavgb; the scalar
+  * tail computes (one + 3 * three + 2) / 4 directly.  width is in pixels
+  * (two bytes each) and is consumed in three stages:
+  *   1. 16 pixels (32 bytes) per iteration,
+  *   2. 4 pixels (8 bytes) per iteration for what is left of width & 0xf,
+  *   3. the final width & 0x3 pixels one byte at a time.
+  */
+ static void quarter_blit_vertical_packed422_scanline_mmxext( uint8_t *output, uint8_t *one,
+                                                              uint8_t *three, int width )
+ {
+     int i;
+
+     SPEEDY_START();
+
+     /* Stage 1: 32 bytes per iteration using all eight MMX registers. */
+     for( i = width/16; i; --i ) {
+         movq_m2r( *one, mm0 );
+         movq_m2r( *three, mm1 );
+         movq_m2r( *(one + 8), mm2 );
+         movq_m2r( *(three + 8), mm3 );
+         movq_m2r( *(one + 16), mm4 );
+         movq_m2r( *(three + 16), mm5 );
+         movq_m2r( *(one + 24), mm6 );
+         movq_m2r( *(three + 24), mm7 );
+         /* avg( avg( one, three ), three ) weights `three` three times. */
+         pavgb_r2r( mm1, mm0 );
+         pavgb_r2r( mm1, mm0 );
+         pavgb_r2r( mm3, mm2 );
+         pavgb_r2r( mm3, mm2 );
+         pavgb_r2r( mm5, mm4 );
+         pavgb_r2r( mm5, mm4 );
+         pavgb_r2r( mm7, mm6 );
+         pavgb_r2r( mm7, mm6 );
+         movntq_r2m( mm0, *output );
+         movntq_r2m( mm2, *(output + 8) );
+         movntq_r2m( mm4, *(output + 16) );
+         movntq_r2m( mm6, *(output + 24) );
+         output += 32;
+         one += 32;
+         three += 32;
+     }
+     width = (width & 0xf);
+
+     /* Stage 2: 8 bytes (4 pixels) per iteration. */
+     for( i = width/4; i; --i ) {
+         movq_m2r( *one, mm0 );
+         movq_m2r( *three, mm1 );
+         pavgb_r2r( mm1, mm0 );
+         pavgb_r2r( mm1, mm0 );
+         movntq_r2m( mm0, *output );
+         output += 8;
+         one += 8;
+         three += 8;
+     }
+     /*
+      * Stage 2 consumed every whole group of 4 pixels, so only the low two
+      * bits remain.  Masking with 0x7 here would redo 4 pixels and run 8
+      * bytes past the end of the scanline whenever bit 2 of width is set.
+      */
+     width = width & 0x3;
+
+     /* Handle last few pixels. */
+     for( i = width * 2; i; --i ) {
+         *output++ = (*one + *three + *three + *three + 2) / 4;
+         one++;
+         three++;
+     }
+
+     sfence();
+     emms();
+
+     SPEEDY_END();
+ }
+
+#endif
+
+ /**
+  * Plain C quarter-position vertical blit for packed 4:2:2 scanlines.
+  * Every one of the width * 2 output bytes is (one + 3 * three + 2) / 4,
+  * i.e. one part of `one` to three parts of `three`, rounded.
+  */
+ static void quarter_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *one,
+                                                         uint8_t *three, int width )
+ {
+     SPEEDY_START();
+     /* Two bytes per pixel; count the remaining bytes down to zero. */
+     for( width *= 2; width; width-- ) {
+         *output++ = (*one + 3 * *three + 2) / 4;
+         three++;
+         one++;
+     }
+     SPEEDY_END();
+ }
+
+ /**
+  * Vertical sub-pixel blit between two packed 4:2:2 scanlines.
+  *
+  * subpixpos is the 16-bit fixed-point weight given to `top`; `bot` gets
+  * 0xffff - subpixpos.  The half and quarter positions are forwarded to the
+  * dedicated scanline functions; any other position is blended here over
+  * width * 2 bytes.
+  */
+ static void subpix_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *top,
+                                                        uint8_t *bot, int subpixpos, int width )
+ {
+     switch( subpixpos ) {
+     case 32768:
+         /* Exactly half way: plain average. */
+         interpolate_packed422_scanline( output, top, bot, width );
+         break;
+     case 16384:
+         /* One quarter top, three quarters bottom. */
+         quarter_blit_vertical_packed422_scanline( output, top, bot, width );
+         break;
+     case 49152:
+         /* Three quarters top, one quarter bottom. */
+         quarter_blit_vertical_packed422_scanline( output, bot, top, width );
+         break;
+     default: {
+         int nbytes;
+         int idx;
+
+         SPEEDY_START();
+
+         nbytes = width * 2;
+         for( idx = 0; idx < nbytes; idx++ ) {
+             output[ idx ] = ( ( top[ idx ] * subpixpos )
+                             + ( bot[ idx ] * ( 0xffff - subpixpos ) ) ) >> 16;
+         }
+         SPEEDY_END();
+         break;
+     }
+     }
+ }
+
+ /**
+  * Horizontal sub-pixel shift of an 8-bit scanline.
+  *
+  * Each output sample mixes the previous input sample (lasta for the first
+  * one) with the current input sample.  The previous sample is weighted by
+  * 0xffff - (startpos & 0xffff), the current one by the remainder of 0xffff,
+  * and the sum is shifted right by 16.
+  */
+ static void a8_subpix_blit_scanline_c( uint8_t *output, uint8_t *input,
+                                        int lasta, int startpos, int width )
+ {
+     const int prev_weight = 0xffff - (startpos & 0xffff);
+     const int cur_weight = 0xffff - prev_weight;
+     int carried = lasta;
+     int remaining;
+
+     for( remaining = width; remaining > 0; remaining-- ) {
+         *output = ( ( carried * prev_weight ) + ( *input * cur_weight ) ) >> 16;
+         /* Re-read after the store, so in-place use sees the written value. */
+         carried = *input;
+         output++;
+         input++;
+     }
+ }
+
+
+ /* Acceleration flags returned by xine_mm_accel(); set in setup_speedy_calls(). */
+ static uint32_t speedy_accel;
+
+ /**
+  * Selects the implementation behind every speedy function pointer.
+  *
+  * Stores the result of xine_mm_accel() in speedy_accel, points every
+  * function pointer at its C implementation, and then, when ARCH_X86 is
+  * defined, overrides a subset with MMXEXT or MMX versions depending on
+  * which MM_ACCEL_X86_* flag is set.  When verbose is non-zero, the chosen
+  * set is reported on stderr.
+  */
+ void setup_speedy_calls( int verbose )
+ {
+ speedy_accel = xine_mm_accel();
+
+ /* Defaults: C implementations. */
+ interpolate_packed422_scanline = interpolate_packed422_scanline_c;
+ blit_colour_packed422_scanline = blit_colour_packed422_scanline_c;
+ blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_c;
+ blit_packed422_scanline = blit_packed422_scanline_c;
+ composite_packed4444_to_packed422_scanline = composite_packed4444_to_packed422_scanline_c;
+ composite_packed4444_alpha_to_packed422_scanline = composite_packed4444_alpha_to_packed422_scanline_c;
+ composite_alphamask_to_packed4444_scanline = composite_alphamask_to_packed4444_scanline_c;
+ composite_alphamask_alpha_to_packed4444_scanline = composite_alphamask_alpha_to_packed4444_scanline_c;
+ premultiply_packed4444_scanline = premultiply_packed4444_scanline_c;
+ blend_packed422_scanline = blend_packed422_scanline_c;
+ filter_luma_121_packed422_inplace_scanline = filter_luma_121_packed422_inplace_scanline_c;
+ filter_luma_14641_packed422_inplace_scanline = filter_luma_14641_packed422_inplace_scanline_c;
+ /*
+  * No C fallback is assigned here: the pointer stays null unless one of the
+  * MMX branches below sets it.
+  * NOTE(review): callers presumably have to check it before calling -- confirm.
+  */
+ comb_factor_packed422_scanline = 0;
+ diff_factor_packed422_scanline = diff_factor_packed422_scanline_c;
+ kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_c;
+ mirror_packed422_inplace_scanline = mirror_packed422_inplace_scanline_c;
+ halfmirror_packed422_inplace_scanline = halfmirror_packed422_inplace_scanline_c;
+ speedy_memcpy = xine_fast_memcpy;
+ diff_packed422_block8x8 = diff_packed422_block8x8_c;
+ a8_subpix_blit_scanline = a8_subpix_blit_scanline_c;
+ quarter_blit_vertical_packed422_scanline = quarter_blit_vertical_packed422_scanline_c;
+ subpix_blit_vertical_packed422_scanline = subpix_blit_vertical_packed422_scanline_c;
+
+ #ifdef ARCH_X86
+ /* MMXEXT takes priority over plain MMX when both flags are set. */
+ if( speedy_accel & MM_ACCEL_X86_MMXEXT ) {
+ if( verbose ) {
+ fprintf( stderr, "speedycode: Using MMXEXT optimized functions.\n" );
+ }
+ interpolate_packed422_scanline = interpolate_packed422_scanline_mmxext;
+ blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmxext;
+ blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmxext;
+ blit_packed422_scanline = blit_packed422_scanline_mmxext;
+ composite_packed4444_to_packed422_scanline = composite_packed4444_to_packed422_scanline_mmxext;
+ composite_packed4444_alpha_to_packed422_scanline = composite_packed4444_alpha_to_packed422_scanline_mmxext;
+ composite_alphamask_to_packed4444_scanline = composite_alphamask_to_packed4444_scanline_mmxext;
+ premultiply_packed4444_scanline = premultiply_packed4444_scanline_mmxext;
+ kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_mmx;
+ blend_packed422_scanline = blend_packed422_scanline_mmxext;
+ diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx;
+ comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx;
+ diff_packed422_block8x8 = diff_packed422_block8x8_mmx;
+ quarter_blit_vertical_packed422_scanline = quarter_blit_vertical_packed422_scanline_mmxext;
+ } else if( speedy_accel & MM_ACCEL_X86_MMX ) {
+ if( verbose ) {
+ fprintf( stderr, "speedycode: Using MMX optimized functions.\n" );
+ }
+ /* Plain MMX: the compositing, premultiply and blend pointers keep their C versions. */
+ interpolate_packed422_scanline = interpolate_packed422_scanline_mmx;
+ blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmx;
+ blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmx;
+ blit_packed422_scanline = blit_packed422_scanline_mmx;
+ diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx;
+ comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx;
+ kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_mmx;
+ diff_packed422_block8x8 = diff_packed422_block8x8_mmx;
+ } else
+ #endif
+ {
+ /* Reached when neither flag is set, or unconditionally when ARCH_X86 is undefined. */
+ if( verbose ) {
+ fprintf( stderr, "speedycode: No MMX or MMXEXT support detected, using C fallbacks.\n" );
+ }
+ }
+ }
+
+ /**
+  * Returns the acceleration flags stored in speedy_accel, converted to int.
+  */
+ int speedy_get_accel( void )
+ {
+     const int flags = (int) speedy_accel;
+
+     return flags;
+ }
+