diff options
Diffstat (limited to 'src/post/deinterlace/speedy.c')
-rw-r--r-- | src/post/deinterlace/speedy.c | 1856 |
1 files changed, 1856 insertions, 0 deletions
diff --git a/src/post/deinterlace/speedy.c b/src/post/deinterlace/speedy.c new file mode 100644 index 000000000..b06e4bc88 --- /dev/null +++ b/src/post/deinterlace/speedy.c @@ -0,0 +1,1856 @@ +/** + * Copyright (c) 2002, 2003 Billy Biggs <vektor@dumbterm.net>. + * Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/* + * Uses code from: + * + * linux/arch/i386/kernel/setup.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Found in linux 2.4.20. + * + * Also helped from code in 'cpuinfo.c' found in mplayer. + */ + +#include <stdio.h> +#include <string.h> +#include <sys/time.h> +#include <stdint.h> +#include <unistd.h> +#include <ctype.h> + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "attributes.h" +#include "xineutils.h" +#include "speedtools.h" +#include "speedy.h" + +/* Function pointer definitions. */ +void (*interpolate_packed422_scanline)( uint8_t *output, uint8_t *top, + uint8_t *bot, int width ); +void (*blit_colour_packed422_scanline)( uint8_t *output, + int width, int y, int cb, int cr ); +void (*blit_colour_packed4444_scanline)( uint8_t *output, + int width, int alpha, int luma, + int cb, int cr ); +void (*blit_packed422_scanline)( uint8_t *dest, const uint8_t *src, int width ); +void (*composite_packed4444_to_packed422_scanline)( uint8_t *output, uint8_t *input, + uint8_t *foreground, int width ); +void (*composite_packed4444_alpha_to_packed422_scanline)( uint8_t *output, + uint8_t *input, + uint8_t *foreground, + int width, int alpha ); +void (*composite_alphamask_to_packed4444_scanline)( uint8_t *output, + uint8_t *input, + uint8_t *mask, int width, + int textluma, int textcb, + int textcr ); +void (*composite_alphamask_alpha_to_packed4444_scanline)( uint8_t *output, + uint8_t *input, + uint8_t *mask, int width, + int textluma, int textcb, + int textcr, int alpha ); +void (*premultiply_packed4444_scanline)( uint8_t *output, uint8_t *input, int width ); +void (*blend_packed422_scanline)( uint8_t *output, uint8_t *src1, + uint8_t *src2, int width, int pos ); +void (*filter_luma_121_packed422_inplace_scanline)( uint8_t *data, int width ); +void (*filter_luma_14641_packed422_inplace_scanline)( uint8_t *data, int width ); +unsigned int (*diff_factor_packed422_scanline)( uint8_t *cur, uint8_t *old, int width ); +unsigned int (*comb_factor_packed422_scanline)( uint8_t *top, uint8_t *mid, + uint8_t *bot, int width ); +void (*kill_chroma_packed422_inplace_scanline)( uint8_t *data, int width ); +void (*mirror_packed422_inplace_scanline)( uint8_t *data, int width ); +void (*halfmirror_packed422_inplace_scanline)( uint8_t *data, int width ); +void *(*speedy_memcpy)( void *output, const void *input, size_t size ); +void (*diff_packed422_block8x8)( pulldown_metrics_t *m, uint8_t *old, + uint8_t *new, int os, int ns ); +void (*a8_subpix_blit_scanline)( uint8_t *output, uint8_t *input, + int lasta, int startpos, int width ); +void (*quarter_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *one, + uint8_t *three, int width ); +void (*subpix_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *top, + uint8_t *bot, int subpixpos, int width ); + + +#define SPEEDY_START() + +#define SPEEDY_END() + +/** + * result = (1 - alpha)B + alpha*F + * = B - alpha*B + alpha*F + * = B + alpha*(F - B) + */ + +static inline __attribute__ ((always_inline,const)) int multiply_alpha( int a, int r ) +{ + int temp; + temp = (r * a) + 0x80; + return ((temp + (temp >> 8)) >> 8); +} + +static inline __attribute__ ((always_inline,const)) uint8_t clip255( int x ) +{ + if( x > 255 ) { + return 255; + } else if( x < 0 ) { + return 0; + } else { + return x; + } +} + +#ifdef ARCH_X86 + +static unsigned int comb_factor_packed422_scanline_mmx( uint8_t *top, uint8_t *mid, + uint8_t *bot, int width ) +{ + const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL }; + const mmx_t qwOnes = { 0x0001000100010001ULL }; + mmx_t qwThreshold; + unsigned int temp1, temp2; + unsigned long CombJaggieThreshold = 73; + + SPEEDY_START(); + + width /= 4; + + qwThreshold.uw[ 0 ] = CombJaggieThreshold; + qwThreshold.uw[ 1 ] = CombJaggieThreshold; + qwThreshold.uw[ 2 ] = CombJaggieThreshold; + qwThreshold.uw[ 3 ] = CombJaggieThreshold; + + movq_m2r( qwThreshold, mm0 ); + movq_m2r( qwYMask, mm1 ); + movq_m2r( qwOnes, mm2 ); + pxor_r2r( mm7, mm7 ); /* mm7 = 0. */ + + while( width-- ) { + /* Load and keep just the luma. */ + movq_m2r( *top, mm3 ); + movq_m2r( *mid, mm4 ); + movq_m2r( *bot, mm5 ); + + pand_r2r( mm1, mm3 ); + pand_r2r( mm1, mm4 ); + pand_r2r( mm1, mm5 ); + + /* Work out mm6 = (top - mid) * (bot - mid) - ( (top - mid)^2 >> 7 ) */ + psrlw_i2r( 1, mm3 ); + psrlw_i2r( 1, mm4 ); + psrlw_i2r( 1, mm5 ); + + /* mm6 = (top - mid) */ + movq_r2r( mm3, mm6 ); + psubw_r2r( mm4, mm6 ); + + /* mm3 = (top - bot) */ + psubw_r2r( mm5, mm3 ); + + /* mm5 = (bot - mid) */ + psubw_r2r( mm4, mm5 ); + + /* mm6 = (top - mid) * (bot - mid) */ + pmullw_r2r( mm5, mm6 ); + + /* mm3 = (top - bot)^2 >> 7 */ + pmullw_r2r( mm3, mm3 ); /* mm3 = (top - bot)^2 */ + psrlw_i2r( 7, mm3 ); /* mm3 = ((top - bot)^2 >> 7) */ + + /* mm6 is what we want. */ + psubw_r2r( mm3, mm6 ); + + /* FF's if greater than qwTheshold */ + pcmpgtw_r2r( mm0, mm6 ); + + /* Add to count if we are greater than threshold */ + pand_r2r( mm2, mm6 ); + paddw_r2r( mm6, mm7 ); + + top += 8; + mid += 8; + bot += 8; + } + + movd_r2m( mm7, temp1 ); + psrlq_i2r( 32, mm7 ); + movd_r2m( mm7, temp2 ); + temp1 += temp2; + temp2 = temp1; + temp1 >>= 16; + temp1 += temp2 & 0xffff; + + emms(); + + SPEEDY_END(); + + return temp1; +} + +#endif + +static unsigned long BitShift = 6; + +static unsigned int diff_factor_packed422_scanline_c( uint8_t *cur, uint8_t *old, int width ) +{ + unsigned int ret = 0; + + SPEEDY_START(); + + width /= 4; + + while( width-- ) { + unsigned int tmp1 = (cur[ 0 ] + cur[ 2 ] + cur[ 4 ] + cur[ 6 ] + 2)>>2; + unsigned int tmp2 = (old[ 0 ] + old[ 2 ] + old[ 4 ] + old[ 6 ] + 2)>>2; + tmp1 = (tmp1 - tmp2); + tmp1 *= tmp1; + tmp1 >>= BitShift; + ret += tmp1; + cur += 8; + old += 8; + } + SPEEDY_END(); + + return ret; +} + +static unsigned int diff_factor_packed422_scanline_test_c( uint8_t *cur, uint8_t *old, int width ) +{ + unsigned int ret = 0; + + SPEEDY_START(); + + width /= 16; + + while( width-- ) { + unsigned int tmp1 = (cur[ 0 ] + cur[ 2 ] + cur[ 4 ] + cur[ 6 ])>>2; + unsigned int tmp2 = (old[ 0 ] + old[ 2 ] + old[ 4 ] + old[ 6 ])>>2; + tmp1 = (tmp1 - tmp2); + tmp1 *= tmp1; + tmp1 >>= BitShift; + ret += tmp1; + cur += (8*4); + old += (8*4); + } + SPEEDY_END(); + + return ret; +} + +#ifdef ARCH_X86 + +static unsigned int diff_factor_packed422_scanline_mmx( uint8_t *cur, uint8_t *old, int width ) +{ + const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL }; + unsigned int temp1, temp2; + + SPEEDY_START(); + + width /= 4; + + movq_m2r( qwYMask, mm1 ); + movd_m2r( BitShift, mm7 ); + pxor_r2r( mm0, mm0 ); + + while( width-- ) { + movq_m2r( *cur, mm4 ); + movq_m2r( *old, mm5 ); + + pand_r2r( mm1, mm4 ); + pand_r2r( mm1, mm5 ); + + psubw_r2r( mm5, mm4 ); /* mm4 = Y1 - Y2 */ + pmaddwd_r2r( mm4, mm4 ); /* mm4 = (Y1 - Y2)^2 */ + psrld_r2r( mm7, mm4 ); /* divide mm4 by 2^BitShift */ + paddd_r2r( mm4, mm0 ); /* keep total in mm0 */ + + cur += 8; + old += 8; + } + + movd_r2m( mm0, temp1 ); + psrlq_i2r( 32, mm0 ); + movd_r2m( mm0, temp2 ); + temp1 += temp2; + + emms(); + + SPEEDY_END(); + + return temp1; +} + +#define ABS(a) (((a) < 0)?-(a):(a)) + +static void diff_packed422_block8x8_mmx( pulldown_metrics_t *m, uint8_t *old, + uint8_t *new, int os, int ns ) +{ + const mmx_t ymask = { 0x00ff00ff00ff00ffULL }; + short out[ 24 ]; /* Output buffer for the partial metrics from the mmx code. */ + uint8_t *outdata = (uint8_t *) out; + uint8_t *oldp, *newp; + int i; + + SPEEDY_START(); + + pxor_r2r( mm4, mm4 ); // 4 even difference sums. + pxor_r2r( mm5, mm5 ); // 4 odd difference sums. + pxor_r2r( mm7, mm7 ); // zeros + + oldp = old; newp = new; + for( i = 4; i; --i ) { + // Even difference. + movq_m2r( oldp[0], mm0 ); + movq_m2r( oldp[8], mm2 ); + pand_m2r( ymask, mm0 ); + pand_m2r( ymask, mm2 ); + oldp += os; + + movq_m2r( newp[0], mm1 ); + movq_m2r( newp[8], mm3 ); + pand_m2r( ymask, mm1 ); + pand_m2r( ymask, mm3 ); + newp += ns; + + movq_r2r( mm0, mm6 ); + psubusb_r2r( mm1, mm0 ); + psubusb_r2r( mm6, mm1 ); + movq_r2r( mm2, mm6 ); + psubusb_r2r( mm3, mm2 ); + psubusb_r2r( mm6, mm3 ); + + paddw_r2r( mm0, mm4 ); + paddw_r2r( mm1, mm4 ); + paddw_r2r( mm2, mm4 ); + paddw_r2r( mm3, mm4 ); + + // Odd difference. + movq_m2r( oldp[0], mm0 ); + movq_m2r( oldp[8], mm2 ); + pand_m2r( ymask, mm0 ); + pand_m2r( ymask, mm2 ); + oldp += os; + + movq_m2r( newp[0], mm1 ); + movq_m2r( newp[8], mm3 ); + pand_m2r( ymask, mm1 ); + pand_m2r( ymask, mm3 ); + newp += ns; + + movq_r2r( mm0, mm6 ); + psubusb_r2r( mm1, mm0 ); + psubusb_r2r( mm6, mm1 ); + movq_r2r( mm2, mm6 ); + psubusb_r2r( mm3, mm2 ); + psubusb_r2r( mm6, mm3 ); + + paddw_r2r( mm0, mm5 ); + paddw_r2r( mm1, mm5 ); + paddw_r2r( mm2, mm5 ); + paddw_r2r( mm3, mm5 ); + } + movq_r2m( mm4, outdata[0] ); + movq_r2m( mm5, outdata[8] ); + + m->e = out[0] + out[1] + out[2] + out[3]; + m->o = out[4] + out[5] + out[6] + out[7]; + m->d = m->e + m->o; + + pxor_r2r( mm4, mm4 ); // Past spacial noise. + pxor_r2r( mm5, mm5 ); // Temporal noise. + pxor_r2r( mm6, mm6 ); // Current spacial noise. + + // First loop to measure first four columns + oldp = old; newp = new; + for( i = 4; i; --i ) { + movq_m2r( oldp[0], mm0 ); + movq_m2r( oldp[os], mm1 ); + pand_m2r( ymask, mm0 ); + pand_m2r( ymask, mm1 ); + oldp += (os*2); + + movq_m2r( newp[0], mm2 ); + movq_m2r( newp[ns], mm3 ); + pand_m2r( ymask, mm2 ); + pand_m2r( ymask, mm3 ); + newp += (ns*2); + + paddw_r2r( mm1, mm4 ); + paddw_r2r( mm1, mm5 ); + paddw_r2r( mm3, mm6 ); + psubw_r2r( mm0, mm4 ); + psubw_r2r( mm2, mm5 ); + psubw_r2r( mm2, mm6 ); + } + movq_r2m( mm4, outdata[0] ); + movq_r2m( mm5, outdata[16] ); + movq_r2m( mm6, outdata[32] ); + + pxor_r2r( mm4, mm4 ); + pxor_r2r( mm5, mm5 ); + pxor_r2r( mm6, mm6 ); + + // Second loop for the last four columns + oldp = old; newp = new; + for( i = 4; i; --i ) { + movq_m2r( oldp[8], mm0 ); + movq_m2r( oldp[os+8], mm1 ); + pand_m2r( ymask, mm0 ); + pand_m2r( ymask, mm1 ); + oldp += (os*2); + + movq_m2r( newp[8], mm2 ); + movq_m2r( newp[ns+8], mm3 ); + pand_m2r( ymask, mm2 ); + pand_m2r( ymask, mm3 ); + newp += (ns*2); + + paddw_r2r( mm1, mm4 ); + paddw_r2r( mm1, mm5 ); + paddw_r2r( mm3, mm6 ); + psubw_r2r( mm0, mm4 ); + psubw_r2r( mm2, mm5 ); + psubw_r2r( mm2, mm6 ); + } + movq_r2m( mm4, outdata[8] ); + movq_r2m( mm5, outdata[24] ); + movq_r2m( mm6, outdata[40] ); + + m->p = m->t = m->s = 0; + for (i=0; i<8; i++) { + // FIXME: move abs() into the mmx code! + m->p += ABS(out[i]); + m->t += ABS(out[8+i]); + m->s += ABS(out[16+i]); + } + + emms(); + + SPEEDY_END(); +} + +#endif + +static void diff_packed422_block8x8_c( pulldown_metrics_t *m, uint8_t *old, + uint8_t *new, int os, int ns ) +{ + int x, y, e=0, o=0, s=0, p=0, t=0; + uint8_t *oldp, *newp; + + SPEEDY_START(); + m->s = m->p = m->t = 0; + for (x = 8; x; x--) { + oldp = old; old += 2; + newp = new; new += 2; + s = p = t = 0; + for (y = 4; y; y--) { + e += ABS(newp[0] - oldp[0]); + o += ABS(newp[ns] - oldp[os]); + s += newp[ns]-newp[0]; + p += oldp[os]-oldp[0]; + t += oldp[os]-newp[0]; + oldp += os<<1; + newp += ns<<1; + } + m->s += ABS(s); + m->p += ABS(p); + m->t += ABS(t); + } + m->e = e; + m->o = o; + m->d = e+o; + SPEEDY_END(); +} + +static void packed444_to_packed422_scanline_c( uint8_t *output, uint8_t *input, int width ) +{ + SPEEDY_START(); + width /= 2; + while( width-- ) { + output[ 0 ] = input[ 0 ]; + output[ 1 ] = input[ 1 ]; + output[ 2 ] = input[ 3 ]; + output[ 3 ] = input[ 2 ]; + output += 4; + input += 6; + } + SPEEDY_END(); +} + +static void packed422_to_packed444_scanline_c( uint8_t *output, uint8_t *input, int width ) +{ + SPEEDY_START(); + width /= 2; + while( width-- ) { + output[ 0 ] = input[ 0 ]; + output[ 1 ] = input[ 1 ]; + output[ 2 ] = input[ 3 ]; + output[ 3 ] = input[ 2 ]; + output[ 4 ] = input[ 1 ]; + output[ 5 ] = input[ 3 ]; + output += 6; + input += 4; + } + SPEEDY_END(); +} + +/** + * For the middle pixels, the filter kernel is: + * + * [-1 3 -6 12 -24 80 80 -24 12 -6 3 -1] + */ +void packed422_to_packed444_rec601_scanline( uint8_t *dest, uint8_t *src, int width ) +{ + int i; + + SPEEDY_START(); + /* Process two input pixels at a time. Input is [Y'][Cb][Y'][Cr]. */ + for( i = 0; i < width / 2; i++ ) { + dest[ (i*6) + 0 ] = src[ (i*4) + 0 ]; + dest[ (i*6) + 1 ] = src[ (i*4) + 1 ]; + dest[ (i*6) + 2 ] = src[ (i*4) + 3 ]; + + dest[ (i*6) + 3 ] = src[ (i*4) + 2 ]; + if( i > (5*2) && i < ((width/2) - (6*2)) ) { + dest[ (i*6) + 4 ] = clip255( (( (80*(src[ (i*4) + 1 ] + src[ (i*4) + 5 ])) + - (24*(src[ (i*4) - 3 ] + src[ (i*4) + 9 ])) + + (12*(src[ (i*4) - 7 ] + src[ (i*4) + 13])) + - ( 6*(src[ (i*4) - 11] + src[ (i*4) + 17])) + + ( 3*(src[ (i*4) - 15] + src[ (i*4) + 21])) + - ( (src[ (i*4) - 19] + src[ (i*4) + 25]))) + 64) >> 7 ); + dest[ (i*6) + 5 ] = clip255( (( (80*(src[ (i*4) + 3 ] + src[ (i*4) + 7 ])) + - (24*(src[ (i*4) - 1 ] + src[ (i*4) + 11])) + + (12*(src[ (i*4) - 5 ] + src[ (i*4) + 15])) + - ( 6*(src[ (i*4) - 9 ] + src[ (i*4) + 19])) + + ( 3*(src[ (i*4) - 13] + src[ (i*4) + 23])) + - ( (src[ (i*4) - 17] + src[ (i*4) + 27]))) + 64) >> 7 ); + } else if( i < ((width/2) - 1) ) { + dest[ (i*6) + 4 ] = (src[ (i*4) + 1 ] + src[ (i*4) + 5 ] + 1) >> 1; + dest[ (i*6) + 5 ] = (src[ (i*4) + 3 ] + src[ (i*4) + 7 ] + 1) >> 1; + } else { + dest[ (i*6) + 4 ] = src[ (i*4) + 1 ]; + dest[ (i*6) + 5 ] = src[ (i*4) + 3 ]; + } + } + SPEEDY_END(); +} + +#ifdef ARCH_X86 + +static void kill_chroma_packed422_inplace_scanline_mmx( uint8_t *data, int width ) +{ + const mmx_t ymask = { 0x00ff00ff00ff00ffULL }; + const mmx_t nullchroma = { 0x8000800080008000ULL }; + + SPEEDY_START(); + + movq_m2r( ymask, mm7 ); + movq_m2r( nullchroma, mm6 ); + for(; width > 4; width -= 4 ) { + movq_m2r( *data, mm0 ); + pand_r2r( mm7, mm0 ); + paddb_r2r( mm6, mm0 ); + movq_r2m( mm0, *data ); + data += 8; + } + emms(); + + while( width-- ) { + data[ 1 ] = 128; + data += 2; + } + SPEEDY_END(); +} + +#endif + +static void kill_chroma_packed422_inplace_scanline_c( uint8_t *data, int width ) +{ + SPEEDY_START(); + while( width-- ) { + data[ 1 ] = 128; + data += 2; + } + SPEEDY_END(); +} + +/* +// this duplicates alternate lines in alternate frames to highlight or mute +// the effects of chroma crawl. it is not a solution or proper filter. it's +// only for testing. +static void testing_packed422_inplace_scanline_c( uint8_t *data, int width, int scanline ) +{ + volatile static int topbottom = 0; + static uint8_t scanbuffer[2048]; + + SPEEDY_START(); + if( scanline <= 1 ) { + topbottom = scanline; + memcpy(scanbuffer, data, width*2); + } + if ( scanline < 10 ) { + printf("scanline: %d %d\n", scanline, topbottom); + } + if ( ((scanline-topbottom)/2)%2 && scanline > 1 ) { + memcpy(data, scanbuffer, width*2); + } else { + memcpy(scanbuffer, data, width*2); + } + SPEEDY_END(); +} +*/ + +static void mirror_packed422_inplace_scanline_c( uint8_t *data, int width ) +{ + int x, tmp1, tmp2; + int width2 = width*2; + + SPEEDY_START(); + for( x = 0; x < width; x += 2 ) { + tmp1 = data[ x ]; + tmp2 = data[ x+1 ]; + data[ x ] = data[ width2 - x ]; + data[ x+1 ] = data[ width2 - x + 1 ]; + data[ width2 - x ] = tmp1; + data[ width2 - x + 1 ] = tmp2; + } + SPEEDY_END(); +} + +static void halfmirror_packed422_inplace_scanline_c( uint8_t *data, int width ) +{ + int x; + + SPEEDY_START(); + for( x = 0; x < width; x += 2 ) { + data[ width + x ] = data[ width - x ]; + data[ width + x + 1 ] = data[ width - x + 1 ]; + } + SPEEDY_END(); +} + +static void filter_luma_121_packed422_inplace_scanline_c( uint8_t *data, int width ) +{ + int r1 = 0; + int r2 = 0; + + SPEEDY_START(); + data += 2; + width -= 1; + while( width-- ) { + int s1, s2; + s1 = *data + r1; r1 = *data; + s2 = s1 + r2; r2 = s1; + *(data - 2) = s2 >> 2; + data += 2; + } + SPEEDY_END(); +} + +static void filter_luma_14641_packed422_inplace_scanline_c( uint8_t *data, int width ) +{ + int r1 = 0; + int r2 = 0; + int r3 = 0; + int r4 = 0; + + SPEEDY_START(); + width -= 4; + data += 4; + while( width-- ) { + int s1, s2, s3, s4; + s1 = *data + r1; r1 = *data; + s2 = s1 + r2; r2 = s1; + s3 = s2 + r3; r3 = s2; + s4 = s3 + r4; r4 = s3; + *(data - 4) = s4 >> 4; + data += 2; + } + SPEEDY_END(); +} + +static void interpolate_packed422_scanline_c( uint8_t *output, uint8_t *top, + uint8_t *bot, int width ) +{ + int i; + + SPEEDY_START(); + + for( i = width*2; i; --i ) { + *output++ = ((*top++) + (*bot++)) >> 1; + } + + SPEEDY_END(); +} + +#ifdef ARCH_X86 + +static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top, + uint8_t *bot, int width ) +{ + const mmx_t shiftmask = { 0xfefffefffefffeffULL }; /* To avoid shifting chroma to luma. */ + int i; + + SPEEDY_START(); + + for( i = width/16; i; --i ) { + movq_m2r( *bot, mm0 ); + movq_m2r( *top, mm1 ); + movq_m2r( *(bot + 8), mm2 ); + movq_m2r( *(top + 8), mm3 ); + movq_m2r( *(bot + 16), mm4 ); + movq_m2r( *(top + 16), mm5 ); + movq_m2r( *(bot + 24), mm6 ); + movq_m2r( *(top + 24), mm7 ); + pand_m2r( shiftmask, mm0 ); + pand_m2r( shiftmask, mm1 ); + pand_m2r( shiftmask, mm2 ); + pand_m2r( shiftmask, mm3 ); + pand_m2r( shiftmask, mm4 ); + pand_m2r( shiftmask, mm5 ); + pand_m2r( shiftmask, mm6 ); + pand_m2r( shiftmask, mm7 ); + psrlw_i2r( 1, mm0 ); + psrlw_i2r( 1, mm1 ); + psrlw_i2r( 1, mm2 ); + psrlw_i2r( 1, mm3 ); + psrlw_i2r( 1, mm4 ); + psrlw_i2r( 1, mm5 ); + psrlw_i2r( 1, mm6 ); + psrlw_i2r( 1, mm7 ); + paddb_r2r( mm1, mm0 ); + paddb_r2r( mm3, mm2 ); + paddb_r2r( mm5, mm4 ); + paddb_r2r( mm7, mm6 ); + movq_r2m( mm0, *output ); + movq_r2m( mm2, *(output + 8) ); + movq_r2m( mm4, *(output + 16) ); + movq_r2m( mm6, *(output + 24) ); + output += 32; + top += 32; + bot += 32; + } + width = (width & 0xf); + + for( i = width/4; i; --i ) { + movq_m2r( *bot, mm0 ); + movq_m2r( *top, mm1 ); + pand_m2r( shiftmask, mm0 ); + pand_m2r( shiftmask, mm1 ); + psrlw_i2r( 1, mm0 ); + psrlw_i2r( 1, mm1 ); + paddb_r2r( mm1, mm0 ); + movq_r2m( mm0, *output ); + output += 8; + top += 8; + bot += 8; + } + width = width & 0x7; + + /* Handle last few pixels. */ + for( i = width * 2; i; --i ) { + *output++ = ((*top++) + (*bot++)) >> 1; + } + + emms(); + + SPEEDY_END(); +} + +static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top, + uint8_t *bot, int width ) +{ + int i; + + SPEEDY_START(); + + for( i = width/16; i; --i ) { + movq_m2r( *bot, mm0 ); + movq_m2r( *top, mm1 ); + movq_m2r( *(bot + 8), mm2 ); + movq_m2r( *(top + 8), mm3 ); + movq_m2r( *(bot + 16), mm4 ); + movq_m2r( *(top + 16), mm5 ); + movq_m2r( *(bot + 24), mm6 ); + movq_m2r( *(top + 24), mm7 ); + pavgb_r2r( mm1, mm0 ); + pavgb_r2r( mm3, mm2 ); + pavgb_r2r( mm5, mm4 ); + pavgb_r2r( mm7, mm6 ); + movntq_r2m( mm0, *output ); + movntq_r2m( mm2, *(output + 8) ); + movntq_r2m( mm4, *(output + 16) ); + movntq_r2m( mm6, *(output + 24) ); + output += 32; + top += 32; + bot += 32; + } + width = (width & 0xf); + + for( i = width/4; i; --i ) { + movq_m2r( *bot, mm0 ); + movq_m2r( *top, mm1 ); + pavgb_r2r( mm1, mm0 ); + movntq_r2m( mm0, *output ); + output += 8; + top += 8; + bot += 8; + } + width = width & 0x7; + + /* Handle last few pixels. */ + for( i = width * 2; i; --i ) { + *output++ = ((*top++) + (*bot++)) >> 1; + } + + sfence(); + emms(); + + SPEEDY_END(); +} + +#endif + +static void blit_colour_packed422_scanline_c( uint8_t *output, int width, int y, int cb, int cr ) +{ + uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; + uint32_t *o = (uint32_t *) output; + + SPEEDY_START(); + + for( width /= 2; width; --width ) { + *o++ = colour; + } + + SPEEDY_END(); +} + +#ifdef ARCH_X86 + +static void blit_colour_packed422_scanline_mmx( uint8_t *output, int width, int y, int cb, int cr ) +{ + uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; + int i; + + SPEEDY_START(); + + movd_m2r( colour, mm1 ); + movd_m2r( colour, mm2 ); + psllq_i2r( 32, mm1 ); + por_r2r( mm1, mm2 ); + + for( i = width / 16; i; --i ) { + movq_r2m( mm2, *output ); + movq_r2m( mm2, *(output + 8) ); + movq_r2m( mm2, *(output + 16) ); + movq_r2m( mm2, *(output + 24) ); + output += 32; + } + width = (width & 0xf); + + for( i = width / 4; i; --i ) { + movq_r2m( mm2, *output ); + output += 8; + } + width = (width & 0x7); + + for( i = width / 2; i; --i ) { + *((uint32_t *) output) = colour; + output += 4; + } + + if( width & 1 ) { + *output = y; + *(output + 1) = cb; + } + + emms(); + + SPEEDY_END(); +} + +static void blit_colour_packed422_scanline_mmxext( uint8_t *output, int width, int y, int cb, int cr ) +{ + uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; + int i; + + SPEEDY_START(); + + movd_m2r( colour, mm1 ); + movd_m2r( colour, mm2 ); + psllq_i2r( 32, mm1 ); + por_r2r( mm1, mm2 ); + + for( i = width / 16; i; --i ) { + movntq_r2m( mm2, *output ); + movntq_r2m( mm2, *(output + 8) ); + movntq_r2m( mm2, *(output + 16) ); + movntq_r2m( mm2, *(output + 24) ); + output += 32; + } + width = (width & 0xf); + + for( i = width / 4; i; --i ) { + movntq_r2m( mm2, *output ); + output += 8; + } + width = (width & 0x7); + + for( i = width / 2; i; --i ) { + *((uint32_t *) output) = colour; + output += 4; + } + + if( width & 1 ) { + *output = y; + *(output + 1) = cb; + } + + sfence(); + emms(); + + SPEEDY_END(); +} + +#endif + +static void blit_colour_packed4444_scanline_c( uint8_t *output, int width, + int alpha, int luma, int cb, int cr ) +{ + int j; + + SPEEDY_START(); + + for( j = 0; j < width; j++ ) { + *output++ = alpha; + *output++ = luma; + *output++ = cb; + *output++ = cr; + } + + SPEEDY_END(); +} + +#ifdef ARCH_X86 + +static void blit_colour_packed4444_scanline_mmx( uint8_t *output, int width, + int alpha, int luma, + int cb, int cr ) +{ + uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha; + int i; + + SPEEDY_START(); + + movd_m2r( colour, mm1 ); + movd_m2r( colour, mm2 ); + psllq_i2r( 32, mm1 ); + por_r2r( mm1, mm2 ); + + for( i = width / 8; i; --i ) { + movq_r2m( mm2, *output ); + movq_r2m( mm2, *(output + 8) ); + movq_r2m( mm2, *(output + 16) ); + movq_r2m( mm2, *(output + 24) ); + output += 32; + } + width = (width & 0x7); + + for( i = width / 2; i; --i ) { + movq_r2m( mm2, *output ); + output += 8; + } + width = (width & 0x1); + + if( width ) { + *((uint32_t *) output) = colour; + output += 4; + } + + emms(); + + SPEEDY_END(); +} + +void blit_colour_packed4444_scanline_mmxext( uint8_t *output, int width, + int alpha, int luma, + int cb, int cr ) +{ + uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha; + int i; + + SPEEDY_START(); + + movd_m2r( colour, mm1 ); + movd_m2r( colour, mm2 ); + psllq_i2r( 32, mm1 ); + por_r2r( mm1, mm2 ); + + for( i = width / 8; i; --i ) { + movntq_r2m( mm2, *output ); + movntq_r2m( mm2, *(output + 8) ); + movntq_r2m( mm2, *(output + 16) ); + movntq_r2m( mm2, *(output + 24) ); + output += 32; + } + width = (width & 0x7); + + for( i = width / 2; i; --i ) { + movntq_r2m( mm2, *output ); + output += 8; + } + width = (width & 0x1); + + if( width ) { + *((uint32_t *) output) = colour; + output += 4; + } + + sfence(); + emms(); + + SPEEDY_END(); +} + +#endif + +static void blit_packed422_scanline_c( uint8_t *dest, const uint8_t *src, int width ) +{ + xine_fast_memcpy( dest, src, width*2 ); +} + +static void blit_packed422_scanline_mmx( uint8_t *dest, const uint8_t *src, int width ) +{ + xine_fast_memcpy( dest, src, width*2 ); +} + +static void blit_packed422_scanline_mmxext( uint8_t *dest, const uint8_t *src, int width ) +{ + xine_fast_memcpy( dest, src, width*2 ); +} + +static void composite_packed4444_alpha_to_packed422_scanline_c( uint8_t *output, uint8_t *input, + uint8_t *foreground, int width, int alpha ) +{ + int i; + + SPEEDY_START(); + for( i = 0; i < width; i++ ) { + int af = foreground[ 0 ]; + + if( af ) { + int a = ((af * alpha) + 0x80) >> 8; + + + if( a == 0xff ) { + output[ 0 ] = foreground[ 1 ]; + + if( ( i & 1 ) == 0 ) { + output[ 1 ] = foreground[ 2 ]; + output[ 3 ] = foreground[ 3 ]; + } + } else if( a ) { + /** + * (1 - alpha)*B + alpha*F + * (1 - af*a)*B + af*a*F + * B - af*a*B + af*a*F + * B + a*(af*F - af*B) + */ + + output[ 0 ] = input[ 0 ] + + ((alpha*( foreground[ 1 ] + - multiply_alpha( foreground[ 0 ], input[ 0 ] ) ) + 0x80) >> 8); + + if( ( i & 1 ) == 0 ) { + + /** + * At first I thought I was doing this incorrectly, but + * the following math has convinced me otherwise. + * + * C_r = (1 - alpha)*B + alpha*F + * C_r = B - af*a*B + af*a*F + * + * C_r = 128 + ((1 - af*a)*(B - 128) + a*af*(F - 128)) + * C_r = 128 + (B - af*a*B - 128 + af*a*128 + a*af*F - a*af*128) + * C_r = B - af*a*B + a*af*F + */ + + output[ 1 ] = input[ 1 ] + ((alpha*( foreground[ 2 ] + - multiply_alpha( foreground[ 0 ], input[ 1 ] ) ) + 0x80) >> 8); + output[ 3 ] = input[ 3 ] + ((alpha*( foreground[ 3 ] + - multiply_alpha( foreground[ 0 ], input[ 3 ] ) ) + 0x80) >> 8); + } + } + } + foreground += 4; + output += 2; + input += 2; + } + SPEEDY_END(); +} + +#ifdef ARCH_X86 + +static void composite_packed4444_alpha_to_packed422_scanline_mmxext( uint8_t *output, + uint8_t *input, + uint8_t *foreground, + int width, int alpha ) +{ + const mmx_t alpha2 = { 0x0000FFFF00000000ULL }; + const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL }; + const mmx_t round = { 0x0080008000800080ULL }; + int i; + + if( !alpha ) { + blit_packed422_scanline( output, input, width ); + return; + } + + if( alpha == 256 ) { + composite_packed4444_to_packed422_scanline( output, input, foreground, width ); + return; + } + + SPEEDY_START(); + READ_PREFETCH_2048( input ); + READ_PREFETCH_2048( foreground ); + + movq_m2r( alpha, mm2 ); + pshufw_r2r( mm2, mm2, 0 ); + pxor_r2r( mm7, mm7 ); + + for( i = width/2; i; i-- ) { + int fg1 = *((uint32_t *) foreground); + int fg2 = *(((uint32_t *) foreground)+1); + + if( fg1 || fg2 ) { + /* mm1 = [ cr ][ y ][ cb ][ y ] */ + movd_m2r( *input, mm1 ); + punpcklbw_r2r( mm7, mm1 ); + + movq_m2r( *foreground, mm3 ); + movq_r2r( mm3, mm4 ); + punpcklbw_r2r( mm7, mm3 ); + punpckhbw_r2r( mm7, mm4 ); + /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ + + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */ + pshufw_r2r( mm3, mm5, 0 ); + pshufw_r2r( mm4, mm6, 0 ); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ + pshufw_r2r( mm3, mm3, 201 ); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ + pshufw_r2r( mm4, mm4, 16 ); + + pand_m2r( alpha1, mm3 ); + pand_m2r( alpha2, mm4 ); + pand_m2r( alpha1, mm5 ); + pand_m2r( alpha2, mm6 ); + por_r2r( mm4, mm3 ); + por_r2r( mm6, mm5 ); + + /* now, mm5 is af and mm1 is B. Need to multiply them. */ + pmullw_r2r( mm1, mm5 ); + + /* Multiply by appalpha. */ + pmullw_r2r( mm2, mm3 ); + paddw_m2r( round, mm3 ); + psrlw_i2r( 8, mm3 ); + /* Result is now B + F. */ + paddw_r2r( mm3, mm1 ); + + /* Round up appropriately. */ + paddw_m2r( round, mm5 ); + + /* mm6 contains our i>>8; */ + movq_r2r( mm5, mm6 ); + psrlw_i2r( 8, mm6 ); + + /* Add mm6 back into mm5. Now our result is in the high bytes. */ + paddw_r2r( mm6, mm5 ); + + /* Shift down. */ + psrlw_i2r( 8, mm5 ); + + /* Multiply by appalpha. */ + pmullw_r2r( mm2, mm5 ); + paddw_m2r( round, mm5 ); + psrlw_i2r( 8, mm5 ); + + psubusw_r2r( mm5, mm1 ); + + /* mm1 = [ B + F - af*B ] */ + packuswb_r2r( mm1, mm1 ); + movd_r2m( mm1, *output ); + } + + foreground += 8; + output += 4; + input += 4; + } + sfence(); + emms(); + + SPEEDY_END(); +} + +#endif + +static void composite_packed4444_to_packed422_scanline_c( uint8_t *output, uint8_t *input, + uint8_t *foreground, int width ) +{ + int i; + SPEEDY_START(); + for( i = 0; i < width; i++ ) { + int a = foreground[ 0 ]; + + if( a == 0xff ) { + output[ 0 ] = foreground[ 1 ]; + + if( ( i & 1 ) == 0 ) { + output[ 1 ] = foreground[ 2 ]; + output[ 3 ] = foreground[ 3 ]; + } + } else if( a ) { + /** + * (1 - alpha)*B + alpha*F + * B + af*F - af*B + */ + + output[ 0 ] = input[ 0 ] + foreground[ 1 ] - multiply_alpha( foreground[ 0 ], input[ 0 ] ); + + if( ( i & 1 ) == 0 ) { + + /** + * C_r = (1 - af)*B + af*F + * C_r = B - af*B + af*F + */ + + output[ 1 ] = input[ 1 ] + foreground[ 2 ] - multiply_alpha( foreground[ 0 ], input[ 1 ] ); + output[ 3 ] = input[ 3 ] + foreground[ 3 ] - multiply_alpha( foreground[ 0 ], input[ 3 ] ); + } + } + foreground += 4; + output += 2; + input += 2; + } + SPEEDY_END(); +} + +#ifdef ARCH_X86 + +static void composite_packed4444_to_packed422_scanline_mmxext( uint8_t *output, uint8_t *input, + uint8_t *foreground, int width ) +{ + const mmx_t alpha2 = { 0x0000FFFF00000000ULL }; + const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL }; + const mmx_t round = { 0x0080008000800080ULL }; + int i; + + SPEEDY_START(); + READ_PREFETCH_2048( input ); + READ_PREFETCH_2048( foreground ); + + pxor_r2r( mm7, mm7 ); + for( i = width/2; i; i-- ) { + int fg1 = *((uint32_t *) foreground); + int fg2 = *(((uint32_t *) foreground)+1); + + if( (fg1 & 0xff) == 0xff && (fg2 & 0xff) == 0xff ) { + movq_m2r( *foreground, mm3 ); + movq_r2r( mm3, mm4 ); + punpcklbw_r2r( mm7, mm3 ); + punpckhbw_r2r( mm7, mm4 ); + /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ + pshufw_r2r( mm3, mm3, 201 ); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ + pshufw_r2r( mm4, mm4, 16 ); + pand_m2r( alpha1, mm3 ); + pand_m2r( alpha2, mm4 ); + por_r2r( mm4, mm3 ); + /* mm1 = [ B + F - af*B ] */ + packuswb_r2r( mm3, mm3 ); + movd_r2m( mm3, *output ); + } else if( fg1 || fg2 ) { + + /* mm1 = [ cr ][ y ][ cb ][ y ] */ + movd_m2r( *input, mm1 ); + punpcklbw_r2r( mm7, mm1 ); + + movq_m2r( *foreground, mm3 ); + movq_r2r( mm3, mm4 ); + punpcklbw_r2r( mm7, mm3 ); + punpckhbw_r2r( mm7, mm4 ); + /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ + + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */ + pshufw_r2r( mm3, mm5, 0 ); + pshufw_r2r( mm4, mm6, 0 ); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ + pshufw_r2r( mm3, mm3, 201 ); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ + pshufw_r2r( mm4, mm4, 16 ); + + pand_m2r( alpha1, mm3 ); + pand_m2r( alpha2, mm4 ); + pand_m2r( alpha1, mm5 ); + pand_m2r( alpha2, mm6 ); + por_r2r( mm4, mm3 ); + por_r2r( mm6, mm5 ); + + /* now, mm5 is af and mm1 is B. Need to multiply them. */ + pmullw_r2r( mm1, mm5 ); + + /* Result is now B + F. */ + paddw_r2r( mm3, mm1 ); + + /* Round up appropriately. */ + paddw_m2r( round, mm5 ); + + /* mm6 contains our i>>8; */ + movq_r2r( mm5, mm6 ); + psrlw_i2r( 8, mm6 ); + + /* Add mm6 back into mm5. Now our result is in the high bytes. */ + paddw_r2r( mm6, mm5 ); + + /* Shift down. */ + psrlw_i2r( 8, mm5 ); + + psubusw_r2r( mm5, mm1 ); + + /* mm1 = [ B + F - af*B ] */ + packuswb_r2r( mm1, mm1 ); + movd_r2m( mm1, *output ); + } + + foreground += 8; + output += 4; + input += 4; + } + sfence(); + emms(); + + SPEEDY_END(); +} + +#endif + +/** + * um... just need some scrap paper... + * D = (1 - alpha)*B + alpha*F + * D = (1 - a)*B + a*textluma + * = B - a*B + a*textluma + * = B + a*(textluma - B) + * Da = (1 - a)*b + a + */ +static void composite_alphamask_to_packed4444_scanline_c( uint8_t *output, + uint8_t *input, + uint8_t *mask, + int width, + int textluma, int textcb, + int textcr ) +{ + uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; + int i; + + SPEEDY_START(); + + for( i = 0; i < width; i++ ) { + int a = *mask; + + if( a == 0xff ) { + *((uint32_t *) output) = opaque; + } else if( (input[ 0 ] == 0x00) ) { + *((uint32_t *) output) = (multiply_alpha( a, textcr ) << 24) + | (multiply_alpha( a, textcb ) << 16) + | (multiply_alpha( a, textluma ) << 8) | a; + } else if( a ) { + *((uint32_t *) output) = ((input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24) + | ((input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16) + | ((input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8) + | (input[ 0 ] + multiply_alpha( a, 0xff - input[ 0 ] )); + } + mask++; + output += 4; + input += 4; + } + SPEEDY_END(); +} + +#ifdef ARCH_X86 + +static void composite_alphamask_to_packed4444_scanline_mmxext( uint8_t *output, + uint8_t *input, + uint8_t *mask, + int width, + int textluma, int textcb, + int textcr ) +{ + uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; + const mmx_t round = { 0x0080008000800080ULL }; + const mmx_t fullalpha = { 0x00000000000000ffULL }; + mmx_t colour; + + SPEEDY_START(); + + colour.w[ 0 ] = 0x00; + colour.w[ 1 ] = textluma; + colour.w[ 2 ] = textcb; + colour.w[ 3 ] = textcr; + + movq_m2r( colour, mm1 ); + movq_r2r( mm1, mm0 ); + + /* mm0 = [ cr ][ cb ][ y ][ 0xff ] */ + paddw_m2r( fullalpha, mm0 ); + + /* mm7 = 0 */ + pxor_r2r( mm7, mm7 ); + + /* mm6 = round */ + movq_m2r( round, mm6 ); + + while( width-- ) { + int a = *mask; + + if( a == 0xff ) { + *((uint32_t *) output) = opaque; + } else if( (input[ 0 ] == 0x00) ) { + /* We just need to multiply our colour by the alpha value. */ + + /* mm2 = [ a ][ a ][ a ][ a ] */ + movd_m2r( a, mm2 ); + movq_r2r( mm2, mm3 ); + pshufw_r2r( mm2, mm2, 0 ); + + /* mm5 = [ cr ][ cb ][ y ][ 0 ] */ + movq_r2r( mm1, mm5 ); + + /* Multiply by alpha. */ + pmullw_r2r( mm2, mm5 ); + paddw_m2r( round, mm5 ); + movq_r2r( mm5, mm6 ); + psrlw_i2r( 8, mm6 ); + paddw_r2r( mm6, mm5 ); + psrlw_i2r( 8, mm5 ); + + /* Set alpha to a. */ + por_r2r( mm3, mm5 ); + + /* Pack and write our result. */ + packuswb_r2r( mm5, mm5 ); + movd_r2m( mm5, *output ); + } else if( a ) { + /* mm2 = [ a ][ a ][ a ][ a ] */ + movd_m2r( a, mm2 ); + pshufw_r2r( mm2, mm2, 0 ); + + /* mm3 = [ cr ][ cb ][ y ][ 0xff ] */ + movq_r2r( mm0, mm3 ); + + /* mm4 = [ i_cr ][ i_cb ][ i_y ][ i_a ] */ + movd_m2r( *input, mm4 ); + punpcklbw_r2r( mm7, mm4 ); + + /* Subtract input and colour. */ + psubw_r2r( mm4, mm3 ); /* mm3 = mm3 - mm4 */ + + /* Multiply alpha. */ + pmullw_r2r( mm2, mm3 ); + paddw_r2r( mm6, mm3 ); + movq_r2r( mm3, mm2 ); + psrlw_i2r( 8, mm3 ); + paddw_r2r( mm2, mm3 ); + psrlw_i2r( 8, mm3 ); + + /* Add back in the input. */ + paddb_r2r( mm3, mm4 ); + + /* Write result. */ + packuswb_r2r( mm4, mm4 ); + movd_r2m( mm4, *output ); + } + mask++; + output += 4; + input += 4; + } + sfence(); + emms(); + SPEEDY_END(); +} + +#endif + +static void composite_alphamask_alpha_to_packed4444_scanline_c( uint8_t *output, + uint8_t *input, + uint8_t *mask, int width, + int textluma, int textcb, + int textcr, int alpha ) +{ + uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; + int i; + + SPEEDY_START(); + + for( i = 0; i < width; i++ ) { + int af = *mask; + + if( af ) { + int a = ((af * alpha) + 0x80) >> 8; + + if( a == 0xff ) { + *((uint32_t *) output) = opaque; + } else if( input[ 0 ] == 0x00 ) { + *((uint32_t *) output) = (multiply_alpha( a, textcr ) << 24) + | (multiply_alpha( a, textcb ) << 16) + | (multiply_alpha( a, textluma ) << 8) | a; + } else if( a ) { + *((uint32_t *) output) = ((input[ 3 ] + multiply_alpha( a, textcr - input[ 3 ] )) << 24) + | ((input[ 2 ] + multiply_alpha( a, textcb - input[ 2 ] )) << 16) + | ((input[ 1 ] + multiply_alpha( a, textluma - input[ 1 ] )) << 8) + | (a + multiply_alpha( 0xff - a, input[ 0 ] )); + } + } + mask++; + output += 4; + input += 4; + } + + SPEEDY_END(); +} + +static void premultiply_packed4444_scanline_c( uint8_t *output, uint8_t *input, int width ) +{ + SPEEDY_START(); + + while( width-- ) { + unsigned int cur_a = input[ 0 ]; + + *((uint32_t *) output) = (multiply_alpha( cur_a, input[ 3 ] ) << 24) + | (multiply_alpha( cur_a, input[ 2 ] ) << 16) + | (multiply_alpha( cur_a, input[ 1 ] ) << 8) + | cur_a; + + output += 4; + input += 4; + } + + SPEEDY_END(); +} + +#ifdef ARCH_X86 + +static void premultiply_packed4444_scanline_mmxext( uint8_t *output, uint8_t *input, int width ) +{ + const mmx_t round = { 0x0080008000800080ULL }; + const mmx_t alpha = { 0x00000000000000ffULL }; + const mmx_t noalp = { 0xffffffffffff0000ULL }; + + SPEEDY_START(); + + pxor_r2r( mm7, mm7 ); + while( width-- ) { + movd_m2r( *input, mm0 ); + punpcklbw_r2r( mm7, mm0 ); + + movq_r2r( mm0, mm2 ); + pshufw_r2r( mm2, mm2, 0 ); + movq_r2r( mm2, mm4 ); + pand_m2r( alpha, mm4 ); + + pmullw_r2r( mm2, mm0 ); + paddw_m2r( round, mm0 ); + + movq_r2r( mm0, mm3 ); + psrlw_i2r( 8, mm3 ); + paddw_r2r( mm3, mm0 ); + psrlw_i2r( 8, mm0 ); + + pand_m2r( noalp, mm0 ); + paddw_r2r( mm4, mm0 ); + + packuswb_r2r( mm0, mm0 ); + movd_r2m( mm0, *output ); + + output += 4; + input += 4; + } + sfence(); + emms(); + + SPEEDY_END(); +} + +#endif + +static void blend_packed422_scanline_c( uint8_t *output, uint8_t *src1, + uint8_t *src2, int width, int pos ) +{ + if( pos == 0 ) { + blit_packed422_scanline( output, src1, width ); + } else if( pos == 256 ) { + blit_packed422_scanline( output, src2, width ); + } else if( pos == 128 ) { + interpolate_packed422_scanline( output, src1, src2, width ); + } else { + width *= 2; + while( width-- ) { + *output++ = ( (*src1++ * ( 256 - pos )) + (*src2++ * pos) + 0x80 ) >> 8; + } + } +} + +#ifdef ARCH_X86 + +static void blend_packed422_scanline_mmxext( uint8_t *output, uint8_t *src1, + uint8_t *src2, int width, int pos ) +{ + if( pos <= 0 ) { + blit_packed422_scanline( output, src1, width ); + } else if( pos >= 256 ) { + blit_packed422_scanline( output, src2, width ); + } else if( pos == 128 ) { + interpolate_packed422_scanline( output, src1, src2, width ); + } else { + const mmx_t all256 = { 0x0100010001000100ULL }; + const mmx_t round = { 0x0080008000800080ULL }; + + SPEEDY_START(); + + movd_m2r( pos, mm0 ); + pshufw_r2r( mm0, mm0, 0 ); + movq_m2r( all256, mm1 ); + psubw_r2r( mm0, mm1 ); + pxor_r2r( mm7, mm7 ); + + for( width /= 2; width; width-- ) { + movd_m2r( *src1, mm3 ); + movd_m2r( *src2, mm4 ); + punpcklbw_r2r( mm7, mm3 ); + punpcklbw_r2r( mm7, mm4 ); + + pmullw_r2r( mm1, mm3 ); + pmullw_r2r( mm0, mm4 ); + paddw_r2r( mm4, mm3 ); + paddw_m2r( round, mm3 ); + psrlw_i2r( 8, mm3 ); + + packuswb_r2r( mm3, mm3 ); + movd_r2m( mm3, *output ); + + output += 4; + src1 += 4; + src2 += 4; + } + sfence(); + emms(); + + SPEEDY_END(); + } +} + +static void quarter_blit_vertical_packed422_scanline_mmxext( uint8_t *output, uint8_t *one, + uint8_t *three, int width ) +{ + int i; + + SPEEDY_START(); + for( i = width/16; i; --i ) { + movq_m2r( *one, mm0 ); + movq_m2r( *three, mm1 ); + movq_m2r( *(one + 8), mm2 ); + movq_m2r( *(three + 8), mm3 ); + movq_m2r( *(one + 16), mm4 ); + movq_m2r( *(three + 16), mm5 ); + movq_m2r( *(one + 24), mm6 ); + movq_m2r( *(three + 24), mm7 ); + pavgb_r2r( mm1, mm0 ); + pavgb_r2r( mm1, mm0 ); + pavgb_r2r( mm3, mm2 ); + pavgb_r2r( mm3, mm2 ); + pavgb_r2r( mm5, mm4 ); + pavgb_r2r( mm5, mm4 ); + pavgb_r2r( mm7, mm6 ); + pavgb_r2r( mm7, mm6 ); + movntq_r2m( mm0, *output ); + movntq_r2m( mm2, *(output + 8) ); + movntq_r2m( mm4, *(output + 16) ); + movntq_r2m( mm6, *(output + 24) ); + output += 32; + one += 32; + three += 32; + } + width = (width & 0xf); + + for( i = width/4; i; --i ) { + movq_m2r( *one, mm0 ); + movq_m2r( *three, mm1 ); + pavgb_r2r( mm1, mm0 ); + pavgb_r2r( mm1, mm0 ); + movntq_r2m( mm0, *output ); + output += 8; + one += 8; + three += 8; + } + width = width & 0x7; + + /* Handle last few pixels. */ + for( i = width * 2; i; --i ) { + *output++ = (*one + *three + *three + *three + 2) / 4; + one++; + three++; + } + + sfence(); + emms(); + + SPEEDY_END(); +} + +#endif + +static void quarter_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *one, + uint8_t *three, int width ) +{ + SPEEDY_START(); + width *= 2; + while( width-- ) { + *output++ = (*one + *three + *three + *three + 2) / 4; + one++; + three++; + } + SPEEDY_END(); +} + +static void subpix_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *top, + uint8_t *bot, int subpixpos, int width ) +{ + if( subpixpos == 32768 ) { + interpolate_packed422_scanline( output, top, bot, width ); + } else if( subpixpos == 16384 ) { + quarter_blit_vertical_packed422_scanline( output, top, bot, width ); + } else if( subpixpos == 49152 ) { + quarter_blit_vertical_packed422_scanline( output, bot, top, width ); + } else { + int x; + + SPEEDY_START(); + + width *= 2; + for( x = 0; x < width; x++ ) { + output[ x ] = ( ( top[ x ] * subpixpos ) + ( bot[ x ] * ( 0xffff - subpixpos ) ) ) >> 16; + } + SPEEDY_END(); + } +} + +static void a8_subpix_blit_scanline_c( uint8_t *output, uint8_t *input, + int lasta, int startpos, int width ) +{ + int pos = 0xffff - (startpos & 0xffff); + int prev = lasta; + int x; + + for( x = 0; x < width; x++ ) { + output[ x ] = ( ( prev * pos ) + ( input[ x ] * ( 0xffff - pos ) ) ) >> 16; + prev = input[ x ]; + } +} + + +static uint32_t speedy_accel; + +void setup_speedy_calls( int verbose ) +{ + speedy_accel = xine_mm_accel(); + + interpolate_packed422_scanline = interpolate_packed422_scanline_c; + blit_colour_packed422_scanline = blit_colour_packed422_scanline_c; + blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_c; + blit_packed422_scanline = blit_packed422_scanline_c; + composite_packed4444_to_packed422_scanline = composite_packed4444_to_packed422_scanline_c; + composite_packed4444_alpha_to_packed422_scanline = composite_packed4444_alpha_to_packed422_scanline_c; + composite_alphamask_to_packed4444_scanline = composite_alphamask_to_packed4444_scanline_c; + composite_alphamask_alpha_to_packed4444_scanline = composite_alphamask_alpha_to_packed4444_scanline_c; + premultiply_packed4444_scanline = premultiply_packed4444_scanline_c; + blend_packed422_scanline = blend_packed422_scanline_c; + filter_luma_121_packed422_inplace_scanline = filter_luma_121_packed422_inplace_scanline_c; + filter_luma_14641_packed422_inplace_scanline = filter_luma_14641_packed422_inplace_scanline_c; + comb_factor_packed422_scanline = 0; + diff_factor_packed422_scanline = diff_factor_packed422_scanline_c; + kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_c; + mirror_packed422_inplace_scanline = mirror_packed422_inplace_scanline_c; + halfmirror_packed422_inplace_scanline = halfmirror_packed422_inplace_scanline_c; + speedy_memcpy = xine_fast_memcpy; + diff_packed422_block8x8 = diff_packed422_block8x8_c; + a8_subpix_blit_scanline = a8_subpix_blit_scanline_c; + quarter_blit_vertical_packed422_scanline = quarter_blit_vertical_packed422_scanline_c; + subpix_blit_vertical_packed422_scanline = subpix_blit_vertical_packed422_scanline_c; + +#ifdef ARCH_X86 + if( speedy_accel & MM_ACCEL_X86_MMXEXT ) { + if( verbose ) { + fprintf( stderr, "speedycode: Using MMXEXT optimized functions.\n" ); + } + interpolate_packed422_scanline = interpolate_packed422_scanline_mmxext; + blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmxext; + blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmxext; + blit_packed422_scanline = blit_packed422_scanline_mmxext; + composite_packed4444_to_packed422_scanline = composite_packed4444_to_packed422_scanline_mmxext; + composite_packed4444_alpha_to_packed422_scanline = composite_packed4444_alpha_to_packed422_scanline_mmxext; + composite_alphamask_to_packed4444_scanline = composite_alphamask_to_packed4444_scanline_mmxext; + premultiply_packed4444_scanline = premultiply_packed4444_scanline_mmxext; + kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_mmx; + blend_packed422_scanline = blend_packed422_scanline_mmxext; + diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx; + comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx; + diff_packed422_block8x8 = diff_packed422_block8x8_mmx; + quarter_blit_vertical_packed422_scanline = quarter_blit_vertical_packed422_scanline_mmxext; + } else if( speedy_accel & MM_ACCEL_X86_MMX ) { + if( verbose ) { + fprintf( stderr, "speedycode: Using MMX optimized functions.\n" ); + } + interpolate_packed422_scanline = interpolate_packed422_scanline_mmx; + blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmx; + blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmx; + blit_packed422_scanline = blit_packed422_scanline_mmx; + diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx; + comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx; + kill_chroma_packed422_inplace_scanline = kill_chroma_packed422_inplace_scanline_mmx; + diff_packed422_block8x8 = diff_packed422_block8x8_mmx; + } else +#endif + { + if( verbose ) { + fprintf( stderr, "speedycode: No MMX or MMXEXT support detected, using C fallbacks.\n" ); + } + } +} + +int speedy_get_accel( void ) +{ + return speedy_accel; +} + |