/* * Copyright (C) 2000-2002 the xine project * * This file is part of xine, a free video player. * * xine is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * xine is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Color Conversion Utility Functions * * Overview: xine's video output modules only accept YUV images from * video decoder modules. A video decoder can either send a planar (YV12) * image or a packed (YUY2) image to a video output module. However, many * older video codecs are RGB-based. Either each pixel is an index * to an RGB value in a palette table, or each pixel is encoded with * red, green, and blue values. In the latter case, typically either * 15, 16, 24, or 32 bits are used to represent a single pixel. * * If you want to use these facilities in your decoder, include the * xineutils.h header file. Then declare a yuv_planes_t structure. This * structure represents 3 non-subsampled YUV planes. "Non-subsampled" * means that there is a Y, U, and V sample for each pixel in the RGB * image, whereas YUV formats are usually subsampled so that the U and * V samples correspond to more than 1 pixel in the output image. 
When
 * you need to convert RGB values to Y, U, and V values, use the
 * COMPUTE_Y(r, g, b), COMPUTE_U(r, g, b), COMPUTE_V(r, g, b) macros found
 * in xineutils.h.
 *
 * The yuv_planes_t structure has 2 other fields: row_width and row_count
 * which are equivalent to the frame width and height, respectively.
 *
 * When an image has been fully decoded into the yuv_planes_t structure,
 * call yuv444_to_yuy2() with the structure, the final (pre-allocated)
 * YUY2 buffer, and the pitch of that buffer. xine will have already chosen
 * the best conversion function to use based on the CPU type. The YUY2
 * buffer will then be ready to pass to the video output module.
 *
 * If your decoder is rendering an image based on an RGB palette, a good
 * strategy is to maintain a YUV palette rather than an RGB palette and
 * render the image directly in YUV.
 *
 * Some utility macros that you may find useful in your decoder are
 * UNPACK_RGB15, UNPACK_RGB16, UNPACK_BGR15, and UNPACK_BGR16. All are
 * located in xineutils.h. Each of them takes a packed pixel, either in
 * RGB or BGR format depending on the macro, and unpacks it into the
 * component red, green, and blue bytes. If a CPU has special instructions
 * to facilitate these operations (such as the PPC AltiVec pixel-unpacking
 * instructions), these macros will automatically map to those special
 * instructions.
 *
 * $Id: color.c,v 1.6 2002/08/28 03:32:48 tmmm Exp $
 */

#include "xine_internal.h"
#include "xineutils.h"

/*
 * In search of the perfect colorspace conversion formulae...
 * These are the conversion equations that xine currently uses:
 *
 *      Y =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *      U = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *      V =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 *
 * Feel free to experiment with different coefficients by altering the
 * next 9 defines.
*/ #if 1 #define Y_R (SCALEFACTOR * 0.29900) #define Y_G (SCALEFACTOR * 0.58700) #define Y_B (SCALEFACTOR * 0.11400) #define U_R (SCALEFACTOR * -0.16874) #define U_G (SCALEFACTOR * -0.33126) #define U_B (SCALEFACTOR * 0.50000) #define V_R (SCALEFACTOR * 0.50000) #define V_G (SCALEFACTOR * -0.41869) #define V_B (SCALEFACTOR * -0.08131) #else /* * Here is another promising set of coefficients. If you use these, you * must also add 16 to the Y calculation in the COMPUTE_Y macro found * in xineutils.h. */ #define Y_R (SCALEFACTOR * 0.257) #define Y_G (SCALEFACTOR * 0.504) #define Y_B (SCALEFACTOR * 0.098) #define U_R (SCALEFACTOR * -0.148) #define U_G (SCALEFACTOR * -0.291) #define U_B (SCALEFACTOR * 0.439) #define V_R (SCALEFACTOR * 0.439) #define V_G (SCALEFACTOR * -0.368) #define V_B (SCALEFACTOR * -0.071) #endif /* * Precalculate all of the YUV tables since it requires fewer than * 10 kilobytes to store them. */ int y_r_table[256]; int y_g_table[256]; int y_b_table[256]; int u_r_table[256]; int u_g_table[256]; int u_b_table[256]; int v_r_table[256]; int v_g_table[256]; int v_b_table[256]; void (*yuv444_to_yuy2) (yuv_planes_t *yuv_planes, unsigned char *yuy2_map, int pitch); /* * init_yuv_planes * * This function initializes a yuv_planes_t structure based on the width * and height passed to it. The width must be divisible by 2. */ void init_yuv_planes(yuv_planes_t *yuv_planes, int width, int height) { int plane_size; yuv_planes->row_width = width; yuv_planes->row_count = height; /* add 6 extra bytes to the plane size to account for residual filtering * on the C planes */ plane_size = yuv_planes->row_width * yuv_planes->row_count + 6; yuv_planes->y = xine_xmalloc(plane_size); yuv_planes->u = xine_xmalloc(plane_size); yuv_planes->v = xine_xmalloc(plane_size); } /* * free_yuv_planes * * This frees the memory used by the YUV planes. 
*/
/*
 * Each plane pointer is reset to NULL once it has been released, so the
 * structure never holds dangling pointers and calling this function a
 * second time on the same structure is harmless. A NULL yuv_planes is
 * ignored.
 */
void free_yuv_planes(yuv_planes_t *yuv_planes) {

  if (!yuv_planes)
    return;

  free(yuv_planes->y);
  yuv_planes->y = NULL;

  free(yuv_planes->u);
  yuv_planes->u = NULL;

  free(yuv_planes->v);
  yuv_planes->v = NULL;
}

/*
 * yuv444_to_yuy2_c
 *
 * This is the simple, portable C version of the yuv444_to_yuy2() function.
 * It is not especially accurate in its method (the chroma is picked, not
 * filtered). But it is fast.
 *
 * yuv_planes contains the 3 non-subsampled planes that represent Y, U,
 * and V samples for every pixel in the image. For each pair of pixels,
 * use both Y samples but use the first pixel's U value and the second
 * pixel's V value.
 *
 *    Y plane: Y0 Y1 Y2 Y3 ...
 *    U plane: U0 U1 U2 U3 ...
 *    V plane: V0 V1 V2 V3 ...
 *
 *   YUY2 map: Y0 U0 Y1 V1 Y2 U2 Y3 V3
 *
 * yuy2_map is the destination buffer and pitch is the distance in bytes
 * between the starts of two consecutive destination rows. Every
 * destination row starts at yuy2_map + row * pitch and at most
 * 2 * row_width bytes of it are written.
 *
 * The width is expected to be even. Should an odd width be passed anyway,
 * the unpaired last pixel of each row gets its Y and U bytes only; no V
 * sample is read from beyond the row and nothing is written past the
 * row's 2 * row_width bytes, so following rows stay aligned.
 */
void yuv444_to_yuy2_c(yuv_planes_t *yuv_planes, unsigned char *yuy2_map,
  int pitch) {

  const int width = yuv_planes->row_width;
  const int height = yuv_planes->row_count;
  int row, x;

  /* nothing to convert for an empty or nonsensical frame size */
  if (width <= 0 || height <= 0)
    return;

  for (row = 0; row < height; row++) {

    const unsigned char *y_row = yuv_planes->y + row * width;
    const unsigned char *u_row = yuv_planes->u + row * width;
    const unsigned char *v_row = yuv_planes->v + row * width;
    unsigned char *out = yuy2_map + row * pitch;

    /* one 4-byte macropixel (Y0 U0 Y1 V1) per complete pair of pixels */
    for (x = 0; x + 1 < width; x += 2) {
      out[0] = y_row[x];
      out[1] = u_row[x];
      out[2] = y_row[x + 1];
      out[3] = v_row[x + 1];
      out += 4;
    }

    /* odd width only: the last pixel has no partner to take V from */
    if (x < width) {
      out[0] = y_row[x];
      out[1] = u_row[x];
    }
  }
}

/*
 * yuv444_to_yuy2_mmx
 *
 * This is the proper, filtering version of the yuv444_to_yuy2() function
 * optimized with basic Intel MMX instructions.
 *
 * yuv_planes contains the 3 non-subsampled planes that represent Y, U,
 * and V samples for every pixel in the image. The goal is to convert the
 * 3 planes to a single packed YUY2 byte stream. Dealing with the Y
 * samples is easy because every Y sample is used in the final image.
* This can still be sped up using MMX instructions. Initialize mm0 to 0. * Then load blocks of 8 Y samples into mm1: * * in memory: Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7 * in mm1: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 * * Use the punpck*bw instructions to interleave the Y samples with zeros. * For example, executing punpcklbw_r2r(mm0, mm1) will result in: * * mm1: 00 Y3 00 Y2 00 Y1 00 Y0 * * which will be written back to memory (in the YUY2 map) as: * * in memory: Y0 00 Y1 00 Y2 00 Y3 00 * * Do the same with the top 4 samples and soon all of the Y samples are * split apart and ready to have the U and V values interleaved. * * The C planes (U and V) must be filtered. The filter looks like this: * * (1 * C1 + 3 * C2 + 3 * C3 + 1 * C4) / 8 * * This filter slides across each row of each color plane. In the end, all * of the samples are filtered and the converter only uses every other * one. Since half of the filtered samples will not be used, their * calculations can safely be skipped. * * This implementation of the converter uses the MMX pmaddwd instruction * which performs 4 16x16 multiplications and 2 additions in parallel. * * First, initialize mm0 to 0 and mm7 to the filter coefficients: * mm0 = 0 * mm7 = 0001 0003 0003 0001 * * For each C plane, init the YUY2 map pointer to either 1 (for the U * plane) or 3 (for the V plane). For each set of 8 C samples, compute * 3 final C samples: 1 for [C0..C3], 1 for [C2..C5], and 1 for [C4..C7]. * Load 8 samples: * mm1 = C7 C6 .. C1 C0 (opposite order than in memory) * * Interleave zeros with the first 4 C samples: * mm2 = 00 C3 00 C2 00 C1 00 C0 * * Use pmaddwd to multiply and add: * mm2 = [C0 * 1 + C1 * 3] [C2 * 3 + C3 * 1] * * Copy mm2 to mm3, shift the high 32 bits in mm3 down, do the final * accumulation, and then divide by 8 (shift right by 3): * mm3 = mm2 * mm3 >>= 32 * mm2 += mm3 * mm2 >>= 3 * * At this point, the lower 8 bits of mm2 contain a filtered C sample. * Move it out to the YUY2 map and advance the map pointer by 4. 
Toss out
 * 2 of the samples in mm1 (C0 and C1) and loop twice more, once for
 * [C2..C5] and once for [C4..C7]. After computing 3 filtered samples,
 * increment the plane pointer by 6 and repeat the whole process.
 *
 * There is a special case when the filter hits the end of the line since
 * it is always necessary to rely on phantom samples beyond the end of the
 * line in order to compute the final 1-3 C samples of a line. This function
 * uses zeros in those phantom positions in order to compute the final
 * samples. However, the function might read up to 6 samples from the next
 * line which might not exist if the filter is already operation on the
 * last line of the plane. This is why the planes are allocated to be 6
 * bytes larger than width * height.
 *
 */
/*
 * Parameters:
 *   yuv_planes  source planes; row_width / row_count give the frame size
 *   yuy2_map    destination packed YUY2 buffer
 *   pitch       used only to derive row_inc, the number of bytes added
 *               to the destination pointer after each row of Y samples
 *
 * When ARCH_X86 is not defined the whole body is compiled out and the
 * function returns without touching yuy2_map.
 *
 * NOTE(review): the code that would zero the phantom samples at the end
 * of a line is commented out below, so the samples that follow the end of
 * the line in memory are what actually gets filtered -- confirm whether
 * the masking should be re-enabled.
 */
void yuv444_to_yuy2_mmx(yuv_planes_t *yuv_planes, unsigned char *yuy2_map,
  int pitch) {
#ifdef ARCH_X86
  int h, i, j, k;
  unsigned char *source_plane;
  unsigned char *dest_plane;
  /* scratch buffer used to move a filtered sample out of an MMX register */
  unsigned char vector[8];
  /* four 16-bit filter taps (1, 3, 3, 1), low byte first */
  unsigned char filter[] = {
    0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x01, 0x00
  };
  /* byte masks that keep only the first 2, 4 or 6 samples of a block */
  unsigned char advance2_andmask[] = {
    0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  };
  unsigned char advance4_andmask[] = {
    0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00
  };
  unsigned char advance6_andmask[] = {
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00
  };
  /* number of complete 6-sample chroma blocks in one row */
  int block_loops = yuv_planes->row_width / 6;
  /* filtered samples to produce from the block currently loaded */
  int filter_loops;
  /* source samples consumed by the end-of-line block */
  int advance_count;
  /* destination bytes left over in a row after 2 bytes per pixel */
  int row_inc = (pitch - 2 * yuv_planes->row_width);

  /* set up some MMX registers:
   * mm0 = 0, mm7 = color filter,
   * mm4..6 = advance 2,4,6 and masks
   * (mm4..6 are only referenced by the commented-out masking code in the
   * end-of-line branch below) */
  pxor_r2r(mm0, mm0);
  movq_m2r(*filter, mm7);
  movq_m2r(*advance2_andmask, mm4);
  movq_m2r(*advance4_andmask, mm5);
  movq_m2r(*advance6_andmask, mm6);

  /* copy the Y samples; the interleaved zero bytes land in the chroma
   * positions, which the C sample pass further down fills in */
  source_plane = yuv_planes->y;
  dest_plane = yuy2_map;
  for (i = 0; i < yuv_planes->row_count; i++) {
    /* iterate through blocks of 8 samples, disregarding extra 2 samples */
    /* NOTE(review): only row_width / 8 whole blocks are handled and
     * source_plane advances solely inside this loop, so for a row_width
     * that is not a multiple of 8 the remaining samples are not copied
     * and both pointers start the next row short -- confirm that callers
     * guarantee a multiple of 8 */
    for (j = 0; j < yuv_planes->row_width / 8; j++) {

      movq_m2r(*source_plane, mm1);  /* load 8 Y samples */
      source_plane += 8;

      movq_r2r(mm1, mm2);  /* mm2 = mm1 */

      punpcklbw_r2r(mm0, mm1); /* interleave lower 4 samples with zeros */
      movq_r2m(mm1, *dest_plane);
      dest_plane += 8;

      punpckhbw_r2r(mm0, mm2); /* interleave upper 4 samples with zeros */
      movq_r2m(mm2, *dest_plane);
      dest_plane += 8;
    }
    dest_plane += row_inc;
  }

  /* figure out the C samples */
  for (h = 0; h < 2; h++) {

    /* select the color plane for this iteration */
    if (h == 0) {
      /* U samples occupy byte 1 of every 4-byte group */
      source_plane = yuv_planes->u;
      dest_plane = yuy2_map + 1;
    } else {
      /* V samples occupy byte 3 of every 4-byte group */
      source_plane = yuv_planes->v;
      dest_plane = yuy2_map + 3;
    }

    /* NOTE(review): unlike the Y loop above, this row loop never adds
     * row_inc to dest_plane, so the chroma bytes only line up with the Y
     * bytes when pitch == 2 * row_width -- confirm */
    for (i = 0; i < yuv_planes->row_count; i++) {

      /* a full block of 6 source samples yields 3 filtered samples */
      filter_loops = 3;

      /* iterate through blocks of 6 samples */
      for (j = 0; j <= block_loops; j++) {

        /* special case for end-of-line residual */
        if (j != block_loops) {

          movq_m2r(*source_plane, mm1);  /* load 8 C samples */
          source_plane += 6;

        } else {

          /* NOTE(review): this branch runs once per row even when
           * row_width is a multiple of 6; advance_count is then forced
           * to 6, so such a row consumes row_width + 6 source samples
           * and emits 3 extra output samples -- verify this is intended */
          advance_count = yuv_planes->row_width % 6;
          if (!advance_count)
            advance_count = 6;
          /* one filtered sample per 2 remaining source samples */
          filter_loops = advance_count / 2;

          movq_m2r(*source_plane, mm1);  /* load 8 C samples */
          source_plane += advance_count;

          /* zero out the rest of the samples */
          /* disabled:
          if (advance_count == 2)
            pand_r2r(mm4, mm1);
          else if (advance_count == 4)
            pand_r2r(mm5, mm1);
          else if (advance_count == 6)
            pand_r2r(mm6, mm1);
          */
        }

        for (k = 0; k < filter_loops; k++) {
          movq_r2r(mm1, mm2);  /* make a copy */

          punpcklbw_r2r(mm0, mm2);  /* interleave lower 4 samples with zeros */
          pmaddwd_r2r(mm7, mm2);  /* apply the filter */
          movq_r2r(mm2, mm3);  /* copy result to mm3 */
          psrlq_i2r(32, mm3);  /* move the upper sum down */
          paddd_r2r(mm3, mm2);  /* mm2 += mm3 */
          psrlq_i2r(3, mm2);  /* divide by 8 */

          /* spill the register and keep only its lowest byte, which is
           * the filtered sample */
          movq_r2m(mm2, *vector);
          dest_plane[0] = vector[0];
          /* the same plane's next sample is 4 bytes (2 pixels) further */
          dest_plane += 4;

          psrlq_i2r(16, mm1);  /* toss out 2 C samples and loop again */
        }
      }
    }
  }

  /* be a good MMX citizen and empty MMX state */
  emms();
#endif
}

/*
 * init_yuv_conversion
 *
 * This function precalculates all of the tables used for converting RGB
 * values to YUV values. This function also decides which conversion
 * functions to use.
*/
void init_yuv_conversion(void) {

  int level;

  /* Fill one entry per possible 8-bit component value. The coefficient
   * macros are floating-point expressions, so every product is converted
   * (truncated toward zero) when it is stored in the int tables. */
  for (level = 0; level <= 255; ++level) {

    /* contributions of red, green and blue to Y */
    y_r_table[level] = Y_R * level;
    y_g_table[level] = Y_G * level;
    y_b_table[level] = Y_B * level;

    /* ... to U */
    u_r_table[level] = U_R * level;
    u_g_table[level] = U_G * level;
    u_b_table[level] = U_B * level;

    /* ... and to V */
    v_r_table[level] = V_R * level;
    v_g_table[level] = V_G * level;
    v_b_table[level] = V_B * level;
  }

  /* use the MMX converter when xine_mm_accel() reports MMX support,
   * otherwise the portable C one */
  yuv444_to_yuy2 = (xine_mm_accel() & MM_ACCEL_X86_MMX)
    ? yuv444_to_yuy2_mmx
    : yuv444_to_yuy2_c;
}