From 2fa5c6fef4325885385ee65788bcb265b584bbf9 Mon Sep 17 00:00:00 2001 From: Mike Melanson Date: Thu, 12 Sep 2002 01:25:28 +0000 Subject: revised the MMX color converter to get rid of unsightly green dot in lower right corner of frame, fixed Y split when frame is not divisible by 8, and optimized filtered sample transfer to memory CVS patchset: 2655 CVS date: 2002/09/12 01:25:28 --- src/xine-utils/color.c | 103 ++++++++++++++++++++++--------------------------- 1 file changed, 46 insertions(+), 57 deletions(-) (limited to 'src') diff --git a/src/xine-utils/color.c b/src/xine-utils/color.c index b99e663b1..d9d1d74cb 100644 --- a/src/xine-utils/color.c +++ b/src/xine-utils/color.c @@ -26,6 +26,8 @@ * to an RGB value in a palette table, or each pixel is encoded with * red, green, and blue values. In the latter case, typically either * 15, 16, 24, or 32 bits are used to represent a single pixel. + * The facilities in this file are designed to ease the pain of converting + * RGB -> YUV. * * If you want to use these facilities in your decoder, include the * xineutils.h header file. Then declare a yuv_planes_t structure. This @@ -59,7 +61,7 @@ * instructions), these macros will automatically map to those special * instructions. 
* - * $Id: color.c,v 1.6 2002/08/28 03:32:48 tmmm Exp $ + * $Id: color.c,v 1.7 2002/09/12 01:25:28 tmmm Exp $ */ #include "xine_internal.h" @@ -143,9 +145,7 @@ void init_yuv_planes(yuv_planes_t *yuv_planes, int width, int height) { yuv_planes->row_width = width; yuv_planes->row_count = height; - /* add 6 extra bytes to the plane size to account for residual filtering - * on the C planes */ - plane_size = yuv_planes->row_width * yuv_planes->row_count + 6; + plane_size = yuv_planes->row_width * yuv_planes->row_count; yuv_planes->y = xine_xmalloc(plane_size); yuv_planes->u = xine_xmalloc(plane_size); @@ -287,64 +287,47 @@ void yuv444_to_yuy2_c(yuv_planes_t *yuv_planes, unsigned char *yuy2_map, * There is a special case when the filter hits the end of the line since * it is always necessary to rely on phantom samples beyond the end of the * line in order to compute the final 1-3 C samples of a line. This function - * uses zeros in those phantom positions in order to compute the final - * samples. However, the function might read up to 6 samples from the next - * line which might not exist if the filter is already operation on the - * last line of the plane. This is why the planes are allocated to be 6 - * bytes larger than width * height. + * rewinds the C sample stream by a few bytes and reuses a few samples in + * order to compute the final samples. This is not strictly correct; a + * better approach would be to mirror the final samples before computing + * the filter. But this reuse method is fast and apparently accurate + * enough. 
* */ void yuv444_to_yuy2_mmx(yuv_planes_t *yuv_planes, unsigned char *yuy2_map, int pitch) { #ifdef ARCH_X86 int h, i, j, k; + int width_div_8 = yuv_planes->row_width / 8; + int width_mod_8 = yuv_planes->row_width % 8; unsigned char *source_plane; unsigned char *dest_plane; - unsigned char vector[8]; unsigned char filter[] = { 0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x01, 0x00 }; - unsigned char advance2_andmask[] = { - 0xFF, 0xFF, - 0x00, 0x00, - 0x00, 0x00, - 0x00, 0x00 - }; - unsigned char advance4_andmask[] = { - 0xFF, 0xFF, - 0xFF, 0xFF, - 0x00, 0x00, - 0x00, 0x00 - }; - unsigned char advance6_andmask[] = { - 0xFF, 0xFF, - 0xFF, 0xFF, - 0xFF, 0xFF, - 0x00, 0x00 - }; int block_loops = yuv_planes->row_width / 6; int filter_loops; - int advance_count; + int residual_filter_loops; int row_inc = (pitch - 2 * yuv_planes->row_width); + residual_filter_loops = (yuv_planes->row_width % 6) / 2; + if (!residual_filter_loops) + residual_filter_loops = 3; + /* set up some MMX registers: - * mm0 = 0, mm7 = color filter, - * mm4..6 = advance 2,4,6 and masks */ + * mm0 = 0, mm7 = color filter */ pxor_r2r(mm0, mm0); movq_m2r(*filter, mm7); - movq_m2r(*advance2_andmask, mm4); - movq_m2r(*advance4_andmask, mm5); - movq_m2r(*advance6_andmask, mm6); /* copy the Y samples */ source_plane = yuv_planes->y; dest_plane = yuy2_map; for (i = 0; i < yuv_planes->row_count; i++) { - /* iterate through blocks of 8 samples, disregarding extra 2 samples */ - for (j = 0; j < yuv_planes->row_width / 8; j++) { + /* iterate through blocks of 8 Y samples */ + for (j = 0; j < width_div_8; j++) { movq_m2r(*source_plane, mm1); /* load 8 Y samples */ source_plane += 8; @@ -360,6 +343,14 @@ void yuv444_to_yuy2_mmx(yuv_planes_t *yuv_planes, unsigned char *yuy2_map, dest_plane += 8; } + /* iterate through residual samples in row if row is not divisible by 8 */ + for (j = 0; j < width_mod_8; j++) { + + *dest_plane = *source_plane; + dest_plane += 2; + source_plane++; + } + dest_plane += row_inc; } @@ -382,28 
+373,22 @@ void yuv444_to_yuy2_mmx(yuv_planes_t *yuv_planes, unsigned char *yuy2_map, /* iterate through blocks of 6 samples */ for (j = 0; j <= block_loops; j++) { - /* special case for end-of-line residual */ - if (j != block_loops) { - movq_m2r(*source_plane, mm1); /* load 8 C samples */ - source_plane += 6; - } else { - advance_count = yuv_planes->row_width % 6; - if (!advance_count) - advance_count = 6; - filter_loops = advance_count / 2; + if (j == block_loops) { + /* special case for end-of-line residual */ + filter_loops = residual_filter_loops; + source_plane -= (8 - residual_filter_loops * 2); movq_m2r(*source_plane, mm1); /* load 8 C samples */ - source_plane += advance_count; + source_plane += 8; + if (residual_filter_loops == 1) + psrlq_i2r(32, mm1); /* toss out 4 samples before starting */ + else if (residual_filter_loops == 2) + psrlq_i2r(16, mm1); /* toss out 2 samples before starting */ - /* zero out the rest of the samples */ -/* - if (advance_count == 2) - pand_r2r(mm4, mm1); - else if (advance_count == 4) - pand_r2r(mm5, mm1); - else if (advance_count == 6) - pand_r2r(mm6, mm1); -*/ + } else { + /* normal case */ + movq_m2r(*source_plane, mm1); /* load 8 C samples */ + source_plane += 6; } for (k = 0; k < filter_loops; k++) { @@ -416,8 +401,12 @@ void yuv444_to_yuy2_mmx(yuv_planes_t *yuv_planes, unsigned char *yuy2_map, paddd_r2r(mm3, mm2); /* mm2 += mm3 */ psrlq_i2r(3, mm2); /* divide by 8 */ - movq_r2m(mm2, *vector); - dest_plane[0] = vector[0]; + /* move the lower 32 bits of mm2 into eax */ + movd_r2r(mm2, eax); + /* move al (the final filtered sample) to its spot in memory */ + __asm__ __volatile__ ("mov %%" "al" ", %0" + : "=X" (*dest_plane) + : /* nothing */ ); dest_plane += 4; psrlq_i2r(16, mm1); /* toss out 2 C samples and loop again */ -- cgit v1.2.3