/*
 * Copyright (C) 2000-2014 the xine project
 *
 * This file is part of xine, a free video player.
 *
 * xine is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * xine is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
 *
 * Color Conversion Utility Functions
 *
 * Overview: xine's video output modules only accept YUV images from
 * video decoder modules. A video decoder can either send a planar (YV12)
 * image or a packed (YUY2) image to a video output module. However, many
 * older video codecs are RGB-based. Either each pixel is an index
 * to an RGB value in a palette table, or each pixel is encoded with
 * red, green, and blue values. In the latter case, typically either
 * 15, 16, 24, or 32 bits are used to represent a single pixel.
 * The facilities in this file are designed to ease the pain of converting
 * RGB -> YUV.
 *
 * If you want to use these facilities in your decoder, include the
 * xineutils.h header file. Then declare a yuv_planes_t structure. This
 * structure represents 3 non-subsampled YUV planes. "Non-subsampled"
 * means that there is a Y, U, and V sample for each pixel in the RGB
 * image, whereas YUV formats are usually subsampled so that the U and
 * V samples correspond to more than 1 pixel in the output image. When
 * you need to convert RGB values to Y, U, and V values, use the
 * COMPUTE_Y(r, g, b), COMPUTE_U(r, g, b), COMPUTE_V(r, g, b) macros found
 * in xineutils.h.
 *
 * The yuv_planes_t structure has 2 other fields: row_width and row_count,
 * which are equivalent to the frame width and height, respectively.
 *
 * When an image has been fully decoded into the yuv_planes_t structure,
 * call yuv444_to_yuy2() with the structure and the final (pre-allocated)
 * YUY2 buffer. xine will have already chosen the best conversion
 * function to use based on the CPU type. The YUY2 buffer will then be
 * ready to pass to the video output module.
 *
 * If your decoder is rendering an image based on an RGB palette, a good
 * strategy is to maintain a YUV palette rather than an RGB palette and
 * render the image directly in YUV.
 *
 * Some utility macros that you may find useful in your decoder are
 * UNPACK_RGB15, UNPACK_RGB16, UNPACK_BGR15, and UNPACK_BGR16. All are
 * located in xineutils.h. All of them take a packed pixel, either in
 * RGB or BGR format depending on the macro, and unpack it into its
 * component red, green, and blue bytes. If a CPU has special instructions
 * to facilitate these operations (such as the PPC AltiVec pixel-unpacking
 * instructions), these macros will automatically map to those special
 * instructions.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdlib.h>
#include <string.h>

#include "xineutils.h"
#include "xine_mmx.h"

/*
 * In search of the perfect colorspace conversion formulae...
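 *
 * (Throughout this file, "normalisation" refers to the 219/255 and
 * 224/255 factors in the macros below, which rescale full-range RGB to
 * the studio-swing ranges Y [16..235] and U/V [16..240].)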
* These are the conversion equations that xine currently uses * (before normalisation): * * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B * U = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 * V = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ /* #define Y_R (SCALEFACTOR * 0.29900 * 219.0 / 255.0) #define Y_G (SCALEFACTOR * 0.58700 * 219.0 / 255.0) #define Y_B (SCALEFACTOR * 0.11400 * 219.0 / 255.0) #define U_R (SCALEFACTOR * -0.16874 * 224.0 / 255.0) #define U_G (SCALEFACTOR * -0.33126 * 224.0 / 255.0) #define U_B (SCALEFACTOR * 0.50000 * 224.0 / 255.0) #define V_R (SCALEFACTOR * 0.50000 * 224.0 / 255.0) #define V_G (SCALEFACTOR * -0.41869 * 224.0 / 255.0) #define V_B (SCALEFACTOR * -0.08131 * 224.0 / 255.0) */ #define Y_R (SCALEFACTOR * 0.299 * 219.0 / 255.0) #define Y_G (SCALEFACTOR * 0.587 * 219.0 / 255.0) #define Y_B (SCALEFACTOR * 0.114 * 219.0 / 255.0) #define U_R (SCALEFACTOR * -0.299 / 1.772 * 224.0 / 255.0) #define U_G (SCALEFACTOR * -0.587 / 1.772 * 224.0 / 255.0) #define U_B (SCALEFACTOR * 0.886 / 1.772 * 224.0 / 255.0) #define V_R (SCALEFACTOR * 0.701 / 1.402 * 224.0 / 255.0) #define V_G (SCALEFACTOR * -0.587 / 1.402 * 224.0 / 255.0) #define V_B (SCALEFACTOR * -0.114 / 1.402 * 224.0 / 255.0) /* * With the normalisation factors above, Y needs 16 added. * This is done during setup, not in the macros in xineutils.h, because * doing it there would be an API change. */ #define Y_MOD (16 * SCALEFACTOR) /* * Precalculate all of the YUV tables since it requires fewer than * 10 kilobytes to store them. */ int y_r_table[256]; int y_g_table[256]; int y_b_table[256]; int u_r_table[256]; int u_g_table[256]; int u_b_table[256]; int v_r_table[256]; int v_g_table[256]; int v_b_table[256]; void (*yuv444_to_yuy2) (const yuv_planes_t *yuv_planes, unsigned char *yuy2_map, int pitch); void (*yuv9_to_yv12) (const unsigned char *y_src, int y_src_pitch, unsigned char *y_dest, int y_dest_pitch, const unsigned char *u_src, int u_src_pitch, unsigned char *u_dest, int u_dest_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *v_dest, int v_dest_pitch, int width, int height); void (*yuv411_to_yv12) (const unsigned char *y_src, int y_src_pitch, unsigned char *y_dest, int y_dest_pitch, const unsigned char *u_src, int u_src_pitch, unsigned char *u_dest, int u_dest_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *v_dest, int v_dest_pitch, int width, int height); void (*yv12_to_yuy2) (const unsigned char *y_src, int y_src_pitch, const unsigned char *u_src, int u_src_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *yuy2_map, int yuy2_pitch, int width, int height, int progressive); void (*yuy2_to_yv12) (const unsigned char *yuy2_map, int yuy2_pitch, unsigned char *y_dst, int y_dst_pitch, unsigned char *u_dst, int u_dst_pitch, unsigned char *v_dst, int v_dst_pitch, int width, int height); /* * init_yuv_planes * * This function initializes a yuv_planes_t structure based on the width * and height passed to it. The width must be divisible by 2. */ void init_yuv_planes(yuv_planes_t *yuv_planes, int width, int height) { memset (yuv_planes, 0, sizeof (*yuv_planes)); yuv_planes->row_width = width; yuv_planes->row_count = height; yuv_planes->y = calloc(width, height); yuv_planes->u = calloc(width, height); yuv_planes->v = calloc(width, height); } /* * free_yuv_planes * * This frees the memory used by the YUV planes. 
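 *
 * A minimal usage sketch for an RGB decoder (hypothetical variable
 * names; init_yuv_conversion() must already have been called, since it
 * builds the lookup tables behind the COMPUTE_* macros):
 *
 *   yuv_planes_t planes;
 *   init_yuv_planes (&planes, width, height);
 *   for (i = 0; i < width * height; i++) {
 *     planes.y[i] = COMPUTE_Y (r[i], g[i], b[i]);
 *     planes.u[i] = COMPUTE_U (r[i], g[i], b[i]);
 *     planes.v[i] = COMPUTE_V (r[i], g[i], b[i]);
 *   }
 *   yuv444_to_yuy2 (&planes, yuy2_buffer, yuy2_pitch);
 *   free_yuv_planes (&planes);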
*/ void free_yuv_planes(yuv_planes_t *yuv_planes) { if (yuv_planes->y) free(yuv_planes->y); if (yuv_planes->u) free(yuv_planes->u); if (yuv_planes->v) free(yuv_planes->v); } /* * yuv444_to_yuy2_c * * This is the simple, portable C version of the yuv444_to_yuy2() function. * It is not especially accurate in its method. But it is fast. * * yuv_planes contains the 3 non-subsampled planes that represent Y, U, * and V samples for every pixel in the image. For each pair of pixels, * use both Y samples but use the first pixel's U value and the second * pixel's V value. * * Y plane: Y0 Y1 Y2 Y3 ... * U plane: U0 U1 U2 U3 ... * V plane: V0 V1 V2 V3 ... * * YUY2 map: Y0 U0 Y1 V1 Y2 U2 Y3 V3 */ static void yuv444_to_yuy2_c(const yuv_planes_t *yuv_planes, unsigned char *yuy2_map, int pitch) { unsigned int row_ptr, pixel_ptr; int yuy2_index; /* copy the Y samples */ yuy2_index = 0; for (row_ptr = 0; row_ptr < yuv_planes->row_width * yuv_planes->row_count; row_ptr += yuv_planes->row_width) { for (pixel_ptr = 0; pixel_ptr < yuv_planes->row_width; pixel_ptr++, yuy2_index += 2) yuy2_map[yuy2_index] = yuv_planes->y[row_ptr + pixel_ptr]; yuy2_index += (pitch - 2*yuv_planes->row_width); } /* copy the C samples */ yuy2_index = 1; for (row_ptr = 0; row_ptr < yuv_planes->row_width * yuv_planes->row_count; row_ptr += yuv_planes->row_width) { for (pixel_ptr = 0; pixel_ptr < yuv_planes->row_width;) { yuy2_map[yuy2_index] = yuv_planes->u[row_ptr + pixel_ptr]; pixel_ptr++; yuy2_index += 2; yuy2_map[yuy2_index] = yuv_planes->v[row_ptr + pixel_ptr]; pixel_ptr++; yuy2_index += 2; } yuy2_index += (pitch - 2*yuv_planes->row_width); } } /* * yuv444_to_yuy2_mmx * * This is the proper, filtering version of the yuv444_to_yuy2() function * optimized with basic Intel MMX instructions. * * yuv_planes contains the 3 non-subsampled planes that represent Y, U, * and V samples for every pixel in the image. The goal is to convert the * 3 planes to a single packed YUY2 byte stream. Dealing with the Y * samples is easy because every Y sample is used in the final image. * This can still be sped up using MMX instructions. Initialize mm0 to 0. * Then load blocks of 8 Y samples into mm1: * * in memory: Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7 * in mm1: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 * * Use the punpck*bw instructions to interleave the Y samples with zeros. * For example, executing punpcklbw_r2r(mm0, mm1) will result in: * * mm1: 00 Y3 00 Y2 00 Y1 00 Y0 * * which will be written back to memory (in the YUY2 map) as: * * in memory: Y0 00 Y1 00 Y2 00 Y3 00 * * Do the same with the top 4 samples and soon all of the Y samples are * split apart and ready to have the U and V values interleaved. * * The C planes (U and V) must be filtered. The filter looks like this: * * (1 * C1 + 3 * C2 + 3 * C3 + 1 * C4) / 8 * * This filter slides across each row of each color plane. In the end, all * of the samples are filtered and the converter only uses every other * one. Since half of the filtered samples will not be used, their * calculations can safely be skipped. * * This implementation of the converter uses the MMX pmaddwd instruction * which performs 4 16x16 multiplications and 2 additions in parallel. * * First, initialize mm0 to 0 and mm7 to the filter coefficients: * mm0 = 0 * mm7 = 0001 0003 0003 0001 * * For each C plane, init the YUY2 map pointer to either 1 (for the U * plane) or 3 (for the V plane). For each set of 8 C samples, compute * 3 final C samples: 1 for [C0..C3], 1 for [C2..C5], and 1 for [C4..C7]. * Load 8 samples: * mm1 = C7 C6 .. 
C1 C0 (opposite order than in memory) * * Interleave zeros with the first 4 C samples: * mm2 = 00 C3 00 C2 00 C1 00 C0 * * Use pmaddwd to multiply and add: * mm2 = [C0 * 1 + C1 * 3] [C2 * 3 + C3 * 1] * * Copy mm2 to mm3, shift the high 32 bits in mm3 down, do the final * accumulation, and then divide by 8 (shift right by 3): * mm3 = mm2 * mm3 >>= 32 * mm2 += mm3 * mm2 >>= 3 * * At this point, the lower 8 bits of mm2 contain a filtered C sample. * Move it out to the YUY2 map and advance the map pointer by 4. Toss out * 2 of the samples in mm1 (C0 and C1) and loop twice more, once for * [C2..C5] and once for [C4..C7]. After computing 3 filtered samples, * increment the plane pointer by 6 and repeat the whole process. * * There is a special case when the filter hits the end of the line since * it is always necessary to rely on phantom samples beyond the end of the * line in order to compute the final 1-3 C samples of a line. This function * rewinds the C sample stream by a few bytes and reuses a few samples in * order to compute the final samples. This is not strictly correct; a * better approach would be to mirror the final samples before computing * the filter. But this reuse method is fast and apparently accurate * enough. * */ static void yuv444_to_yuy2_mmx(const yuv_planes_t *yuv_planes, unsigned char *yuy2_map, int pitch) { #if defined(ARCH_X86) || defined(ARCH_X86_64) unsigned int i; int h, j, k; int width_div_8 = yuv_planes->row_width / 8; int width_mod_8 = yuv_planes->row_width % 8; unsigned char *source_plane; unsigned char *dest_plane; static const mmx_t filter = {ub: { 0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x01, 0x00 }}; unsigned char shifter[] = {0, 0, 0, 0, 0, 0, 0, 0}; unsigned char vector[8]; int block_loops = yuv_planes->row_width / 6; int filter_loops; int residual_filter_loops; int row_inc = (pitch - 2 * yuv_planes->row_width); residual_filter_loops = (yuv_planes->row_width % 6) / 2; shifter[0] = residual_filter_loops * 8; /* if the width is divisible by 6, apply 3 residual filters and perform * one less primary loop */ if (!residual_filter_loops) { residual_filter_loops = 3; block_loops--; } /* set up some MMX registers: * mm0 = 0, mm7 = color filter */ pxor_r2r(mm0, mm0); movq_m2r(filter, mm7); /* copy the Y samples */ source_plane = yuv_planes->y; dest_plane = yuy2_map; for (i = 0; i < yuv_planes->row_count; i++) { /* iterate through blocks of 8 Y samples */ for (j = 0; j < width_div_8; j++) { movq_m2r(*source_plane, mm1); /* load 8 Y samples */ source_plane += 8; movq_r2r(mm1, mm2); /* mm2 = mm1 */ punpcklbw_r2r(mm0, mm1); /* interleave lower 4 samples with zeros */ movq_r2m(mm1, *dest_plane); dest_plane += 8; punpckhbw_r2r(mm0, mm2); /* interleave upper 4 samples with zeros */ movq_r2m(mm2, *dest_plane); dest_plane += 8; } /* iterate through residual samples in row if row is not divisible by 8 */ for (j = 0; j < width_mod_8; j++) { *dest_plane = *source_plane; dest_plane += 2; source_plane++; } dest_plane += row_inc; } /* figure out the C samples */ for (h = 0; h < 2; h++) { /* select the color plane for this iteration */ if (h == 0) { source_plane = yuv_planes->u; dest_plane = yuy2_map + 1; } else { source_plane = yuv_planes->v; dest_plane = yuy2_map + 3; } for (i = 0; i < yuv_planes->row_count; i++) { filter_loops = 3; /* iterate through blocks of 6 samples */ for (j = 0; j <= block_loops; j++) { if (j == block_loops) { /* special case for end-of-line residual */ filter_loops = residual_filter_loops; source_plane -= (8 - residual_filter_loops * 2); 
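        /* step back into the row so the final residual samples can be
         * re-read as part of a full 8-byte load; see the note on
         * end-of-line handling in the comment block above */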
        movq_m2r(*source_plane, mm1); /* load 8 C samples */
          source_plane += 8;
          psrlq_m2r(*shifter, mm1); /* toss out samples before starting */
        } else {
          /* normal case */
          movq_m2r(*source_plane, mm1); /* load 8 C samples */
          source_plane += 6;
        }

        for (k = 0; k < filter_loops; k++) {
          movq_r2r(mm1, mm2);      /* make a copy */
          punpcklbw_r2r(mm0, mm2); /* interleave lower 4 samples with zeros */
          pmaddwd_r2r(mm7, mm2);   /* apply the filter */
          movq_r2r(mm2, mm3);      /* copy result to mm3 */
          psrlq_i2r(32, mm3);      /* move the upper sum down */
          paddd_r2r(mm3, mm2);     /* mm2 += mm3 */
          psrlq_i2r(3, mm2);       /* divide by 8 */

#if 0
          /* load the destination address into ebx */
          __asm__ __volatile__ ("mov %0, %%ebx"
            : /* nothing */
            : "X" (dest_plane)
            : "ebx" /* clobber list */);

          /* move the lower 32 bits of mm2 into eax */
          __asm__ __volatile__ ("movd %%mm2, %%eax"
            : /* nothing */
            : /* nothing */
            : "eax" /* clobber list */ );

          /* move al (the final filtered sample) to its spot in memory */
          __asm__ __volatile__ ("mov %%al, (%%ebx)"
            : /* nothing */
            : /* nothing */ );
#else
          movq_r2m(mm2, *vector);
          dest_plane[0] = vector[0];
#endif
          dest_plane += 4;

          psrlq_i2r(16, mm1); /* toss out 2 C samples and loop again */
        }
      }

      dest_plane += row_inc;
    }
  }

  /* be a good MMX citizen and empty MMX state */
  emms();
#endif
}

static void hscale_chroma_line (unsigned char *dst, const unsigned char *src,
  int width) {

  unsigned int n1, n2;
  int x;

  n1 = *src;
  *(dst++) = n1;

  for (x=0; x < (width - 1); x++) {
    n2 = *(++src);
    *(dst++) = (3*n1 + n2 + 2) >> 2;
    *(dst++) = (n1 + 3*n2 + 2) >> 2;
    n1 = n2;
  }

  *dst = n1;
}

static void vscale_chroma_line (unsigned char *dst, int pitch,
  const unsigned char *src1, const unsigned char *src2, int width) {

  unsigned int t1, t2;
  unsigned int n1, n2, n3, n4;
  unsigned int *dst1, *dst2;
  int x;

  dst1 = (unsigned int *) dst;
  dst2 = (unsigned int *) (dst + pitch);

  /* process blocks of 4 pixels */
  for (x=0; x < (width / 4); x++) {
    n1 = *((unsigned int *) src1);
    src1 = (const unsigned char *)(((const unsigned int *) src1) + 1);
    n2 = *((unsigned int *) src2);
    src2 = (const unsigned char *)(((const unsigned int *) src2) + 1);
    n3 = (n1 & 0xFF00FF00) >> 8;
    n4 = (n2 & 0xFF00FF00) >> 8;
    n1 &= 0x00FF00FF;
    n2 &= 0x00FF00FF;

    t1 = (2*n1 + 2*n2 + 0x20002);
    t2 = (n1 - n2);
    n1 = (t1 + t2);
    n2 = (t1 - t2);
    t1 = (2*n3 + 2*n4 + 0x20002);
    t2 = (n3 - n4);
    n3 = (t1 + t2);
    n4 = (t1 - t2);

    *(dst1++) = ((n1 >> 2) & 0x00FF00FF) | ((n3 << 6) & 0xFF00FF00);
    *(dst2++) = ((n2 >> 2) & 0x00FF00FF) | ((n4 << 6) & 0xFF00FF00);
  }

  /* process remaining pixels */
  for (x=(width & ~0x3); x < width; x++) {
    n1 = src1[x];
    n2 = src2[x];

    dst[x] = (3*n1 + n2 + 2) >> 2;
    dst[x+pitch] = (n1 + 3*n2 + 2) >> 2;
  }
}

static void upsample_c_plane_c(const unsigned char *src, int src_width,
  int src_height, unsigned char *dest,
  unsigned int src_pitch, unsigned int dest_pitch) {

  unsigned char *cr1;
  unsigned char *cr2;
  unsigned char *tmp;
  int y;

  cr1 = &dest[dest_pitch * (src_height * 2 - 2)];
  cr2 = &dest[dest_pitch * (src_height * 2 - 3)];

  /* horizontally upscale first line */
  hscale_chroma_line (cr1, src, src_width);
  src += src_pitch;

  /* store first line */
  memcpy (dest, cr1, src_width * 2);
  dest += dest_pitch;

  for (y = 0; y < (src_height - 1); y++) {

    hscale_chroma_line (cr2, src, src_width);
    src += src_pitch;

    /* interpolate and store two lines */
    vscale_chroma_line (dest, dest_pitch, cr1, cr2, src_width * 2);
    dest += 2 * dest_pitch;

    /* swap buffers */
    tmp = cr2;
    cr2 = cr1;
    cr1 = tmp;
  }

  /* horizontally upscale and store last line */
  src -= src_pitch;
  hscale_chroma_line (dest, src, src_width);
}

/*
* yuv9_to_yv12_c * */ static void yuv9_to_yv12_c (const unsigned char *y_src, int y_src_pitch, unsigned char *y_dest, int y_dest_pitch, const unsigned char *u_src, int u_src_pitch, unsigned char *u_dest, int u_dest_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *v_dest, int v_dest_pitch, int width, int height) { int y; /* Y plane */ for (y=0; y < height; y++) { xine_fast_memcpy (y_dest, y_src, width); y_src += y_src_pitch; y_dest += y_dest_pitch; } /* U plane */ upsample_c_plane_c(u_src, width / 4, height / 4, u_dest, u_src_pitch, u_dest_pitch); /* V plane */ upsample_c_plane_c(v_src, width / 4, height / 4, v_dest, v_src_pitch, v_dest_pitch); } /* * yuv411_to_yv12_c * */ static void yuv411_to_yv12_c (const unsigned char *y_src, int y_src_pitch, unsigned char *y_dest, int y_dest_pitch, const unsigned char *u_src, int u_src_pitch, unsigned char *u_dest, int u_dest_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *v_dest, int v_dest_pitch, int width, int height) { int y; int c_src_row, c_src_pixel; int c_dest_row, c_dest_pixel; unsigned char c_sample; /* Y plane */ for (y=0; y < height; y++) { xine_fast_memcpy (y_dest, y_src, width); y_src += y_src_pitch; y_dest += y_dest_pitch; } /* naive approach: downsample vertically, upsample horizontally */ /* U plane */ for (c_src_row = 0, c_dest_row = 0; c_src_row < u_src_pitch * height; c_src_row += u_src_pitch * 2, c_dest_row += u_dest_pitch) { for (c_src_pixel = c_src_row, c_dest_pixel = c_dest_row; c_dest_pixel < c_dest_row + u_dest_pitch; c_src_pixel++) { /* downsample by averaging the samples from 2 rows */ c_sample = (u_src[c_src_pixel] + u_src[c_src_pixel + u_src_pitch] + 1) / 2; /* upsample by outputting the sample twice on the YV12 row */ u_dest[c_dest_pixel++] = c_sample; u_dest[c_dest_pixel++] = c_sample; } } /* V plane */ for (c_src_row = 0, c_dest_row = 0; c_src_row < v_src_pitch * height; c_src_row += v_src_pitch * 2, c_dest_row += v_dest_pitch) { for (c_src_pixel = c_src_row, c_dest_pixel = c_dest_row; c_dest_pixel < c_dest_row + v_dest_pitch; c_src_pixel++) { /* downsample by averaging the samples from 2 rows */ c_sample = (v_src[c_src_pixel] + v_src[c_src_pixel + v_src_pitch] + 1 ) / 2; /* upsample by outputting the sample twice on the YV12 row */ v_dest[c_dest_pixel++] = c_sample; v_dest[c_dest_pixel++] = c_sample; } } } #define C_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2) \ utmp = 3 * *p_u++; \ vtmp = 3 * *p_v++; \ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ *p_line1++ = (*p_ut++ + utmp) >> 2; *p_line2++ = (utmp + *p_ub++) >> 2; \ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ *p_line1++ = (*p_vt++ + vtmp) >> 2; *p_line2++ = (vtmp + *p_vb++) >> 2; \ #define C_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2) \ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ *p_line1++ = (*p_ut++ + *p_u * 7) >> 3; *p_line2++ = (*p_u++ * 5 + *p_ub++ * 3) >> 3; \ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ *p_line1++ = (*p_vt++ + *p_v * 7) >> 3; *p_line2++ = (*p_v++ * 5 + *p_vb++ * 3) >> 3; \ /***************************************************************************** * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 * original conversion routine from Videolan project * changed to support interlaced frames and do correct chroma upsampling with * correct weighting factors and no chroma shift. 
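 *
 * For progressive frames each output line takes its chroma from the two
 * nearest source lines with 3:1 weighting (see C_YUV420_YUYV_PROGRESSIVE
 * above): line 1 gets (Ct + 3*C) / 4 and line 2 gets (3*C + Cb) / 4.
 * For interlaced frames the two fields are upsampled separately, which
 * gives the 7:1 and 5:3 weightings of C_YUV420_YUYV_INTERLACED.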
*****************************************************************************/ static void yv12_to_yuy2_c (const unsigned char *y_src, int y_src_pitch, const unsigned char *u_src, int u_src_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *yuy2_map, int yuy2_pitch, int width, int height, int progressive) { uint8_t *p_line1, *p_line2 = yuy2_map; const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; const uint8_t *p_ub, *p_vb; const uint8_t *p_ut = u_src; const uint8_t *p_vt = v_src; int i_x, i_y; int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; const int i_source_v_margin = v_src_pitch - width/2; const int i_dest_margin = yuy2_pitch - width*2; if( progressive ) { for( i_y = height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += yuy2_pitch; p_y1 = p_y2; p_y2 += y_src_pitch; if( i_y > 1 ) { p_ub = p_u + u_src_pitch; p_vb = p_v + v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } for( i_x = width / 2 ; i_x-- ; ) { C_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; p_ut = p_u - u_src_pitch; p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += 2 * yuy2_pitch; p_y1 = p_y2; p_y2 += 2 * y_src_pitch; if( i_y > 1 ) { p_ub = p_u + 2 * u_src_pitch; p_vb = p_v + 2 * v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } for( i_x = width / 2 ; i_x-- ; ) { C_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; p_ut = p_u - 2 * u_src_pitch; p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } p_line2 = yuy2_map + yuy2_pitch; p_y2 = y_src + y_src_pitch; p_u = u_src + u_src_pitch; p_v = v_src + v_src_pitch; p_ut = p_u; p_vt = p_v; for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += 2 * yuy2_pitch; p_y1 = p_y2; p_y2 += 2 * y_src_pitch; if( i_y > 1 ) { p_ub = p_u + 2 * u_src_pitch; p_vb = p_v + 2 * v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } /* swap arguments for even lines */ for( i_x = width / 2 ; i_x-- ; ) { C_YUV420_YUYV_INTERLACED(p_y2,p_y1,p_u,p_ub,p_ut,p_v,p_vb,p_vt,p_line2,p_line1); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; p_ut = p_u - 2 * u_src_pitch; p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } } } #if defined(ARCH_X86) || defined(ARCH_X86_64) static const int64_t __attribute__((__used__)) byte_one = 0x0101010101010101ll; #define MMX_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2) \ do { \ __asm__ __volatile__( \ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "movd (%1), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%2), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "psllw $1, %%mm3 \n\t" /* Cb * 2 */ \ "psllw $1, %%mm4 \n\t" /* Cr * 2 */ \ "paddw %%mm3, %%mm1 \n\t" /* Cb * 3 */ \ "paddw %%mm4, %%mm2 \n\t" /* Cr * 3 */ \ : \ : "r" (p_y1), "r" (p_u), "r" (p_v) ); 
\ __asm__ __volatile__( \ "movd (%0), %%mm3 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%1), %%mm4 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ "movd (%2), %%mm5 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%3), %%mm6 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "paddw %%mm1, %%mm3 \n\t" /* Cb1 = Cbt + 3*Cb */ \ "paddw %%mm2, %%mm4 \n\t" /* Cr1 = Crt + 3*Cr */ \ "paddw %%mm5, %%mm1 \n\t" /* Cb2 = Cbb + 3*Cb */ \ "paddw %%mm6, %%mm2 \n\t" /* Cr2 = Crb + 3*Cr */ \ "psrlw $2, %%mm3 \n\t" /* Cb1 = (Cbt + 3*Cb) / 4 */ \ /* either the shifts by 2 and 8 or mask off bits and shift by 6 */ \ "psrlw $2, %%mm4 \n\t" /* Cr1 = (Crt + 3*Cr) / 4 */ \ "psllw $8, %%mm4 \n\t" \ "por %%mm4, %%mm3 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ "psrlw $2, %%mm1 \n\t" /* Cb2 = (Cbb + 3*Cb) / 4 */ \ "psrlw $2, %%mm2 \n\t" /* Cr2 = (Cbb + 3*Cb) / 4 */ \ "psllw $8, %%mm2 \n\t" \ "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ "movq %%mm0, %%mm1 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%mm3, %%mm1 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ : \ : "r" (p_ut), "r" (p_vt), "r" (p_ub), "r" (p_vb) ); \ __asm__ __volatile__( \ "movq %%mm1, (%0) \n\t" /* Store low YUYV1 */ \ "punpckhbw %%mm3, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ "movq %%mm0, 8(%0) \n\t" /* Store high YUYV1 */ \ "movq (%2), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ "movq %%mm1, (%1) \n\t" /* Store low YUYV2 */ \ "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ "movq %%mm0, 8(%1) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_line1), "r" (p_line2), "r" (p_y2) ); \ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ } while(0) #define MMXEXT_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2) \ do { \ __asm__ __volatile__( \ "movd %0, %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movd %1, %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ "movd %2, %%mm3 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ "movd %3, %%mm4 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ "punpcklbw %%mm2, %%mm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ : \ : "m" (*p_u), "m" (*p_v), "m" (*p_ut), "m" (*p_vt) ); \ __asm__ __volatile__( \ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%mm4, %%mm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb1 = 1/2(CrCbt + CrCb) */ \ /* for correct rounding */ \ "psubusb %%mm7, %%mm3 \n\t" \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb1 = 1/2(1/2(CrCbt + CrCb) + CrCb) */ \ "movq %%mm0, %%mm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%mm3, %%mm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movntq %%mm2, (%1) \n\t" /* Store low YUYV1 */ \ "punpckhbw %%mm3, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV1 */ \ : \ : "r" (p_y1), "r" (p_line1) ); \ __asm__ __volatile__( \ "movd %1, %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ "movd %2, %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%mm4, %%mm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 
u0 */ \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb2 = 1/2(CrCbb + CrCb) */ \ /* for correct rounding */ \ "psubusb %%mm7, %%mm3 \n\t" \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb2 = 1/2(1/2(CrCbb + CrCb) + CrCb) */ \ "movq %%mm0, %%mm2 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%mm3, %%mm2 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ "movntq %%mm2, (%3) \n\t" /* Store low YUYV2 */ \ "punpckhbw %%mm3, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_y2), "m" (*p_ub), "m" (*p_vb), "r" (p_line2) ); \ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ } while(0) #define SSE2_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2) \ do { \ __asm__ __volatile__( \ "movq %0, %%xmm1 \n\t" /* Load 8 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movq %1, %%xmm2 \n\t" /* Load 8 Cr 00 00 00 00 v3 v2 v1 v0 */ \ "movq %2, %%xmm3 \n\t" /* Load 8 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ "movq %3, %%xmm4 \n\t" /* Load 8 Crt 00 00 00 00 v3 v2 v1 v0 */ \ "punpcklbw %%xmm2, %%xmm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ : \ : "m" (*p_u), "m" (*p_v), "m" (*p_ut), "m" (*p_vt) ); \ __asm__ __volatile__( \ "movdqa (%0), %%xmm0 \n\t" /* Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%xmm4, %%xmm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb1 = 1/2(CrCbt + CrCb) */ \ /* for correct rounding */ \ "psubusb %%xmm7, %%xmm3 \n\t" \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb1 = 1/2(1/2(CrCbt + CrCb) + CrCb) */ \ "movdqa %%xmm0, %%xmm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%xmm3, %%xmm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movntdq %%xmm2, (%1) \n\t" /* Store low YUYV1 */ \ "punpckhbw %%xmm3, %%xmm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ "movntdq %%xmm0, 16(%1) \n\t" /* Store high YUYV1 */ \ : \ : "r" (p_y1), "r" (p_line1) ); \ __asm__ __volatile__( \ "movq %1, %%xmm3 \n\t" /* Load 8 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ "movq %2, %%xmm4 \n\t" /* Load 8 Crb 00 00 00 00 v3 v2 v1 v0 */ \ "movdqa (%0), %%xmm0 \n\t" /* Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%xmm4, %%xmm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb2 = 1/2(CrCbb + CrCb) */ \ /* for correct rounding */ \ "psubusb %%xmm7, %%xmm3 \n\t" \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb2 = 1/2(1/2(CrCbb + CrCb) + CrCb) */ \ "movdqa %%xmm0, %%xmm2 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%xmm3, %%xmm2 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ "movntdq %%xmm2, (%3) \n\t" /* Store low YUYV2 */ \ "punpckhbw %%xmm3, %%xmm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ "movntdq %%xmm0, 16(%3) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_y2), "m" (*p_ub), "m" (*p_vb), "r" (p_line2) ); \ p_line1 += 32; p_line2 += 32; p_y1 += 16; p_y2 += 16; p_u += 8; p_v += 8; \ p_ub += 8; p_vb += 8; p_ut += 8; p_vt += 8; \ } while(0) #define MMX_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2) \ do { \ __asm__ __volatile__( \ "movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \ "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \ "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \ "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 
*/ \ "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \ "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \ "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \ "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \ : \ : "r" (p_u), "r" (p_v) ); \ __asm__ __volatile__( \ "movd (%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb */ \ "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \ "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \ /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \ "psllw $8, %%mm6 \n\t" \ "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \ "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ "movq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \ : \ : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \ __asm__ __volatile__( \ "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \ "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \ "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \ "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \ "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \ "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \ "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \ /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb + 5*Cr) / 8 */ \ "psllw $8, %%mm2 \n\t" \ "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ "movq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \ "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ "movq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ } while(0) #define MMXEXT_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2) \ do { \ __asm__ __volatile__( \ "movd %0, %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movd %1, %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ "movd %2, %%mm3 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ "movd %3, %%mm4 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ "punpcklbw %%mm2, %%mm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ : \ : "m" (*p_u), "m" (*p_v), "m" (*p_ut), "m" (*p_vt) ); \ __asm__ __volatile__( \ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%mm4, %%mm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb1 = 1/2(CrCbt + CrCb) */ \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb1 = 1/2(1/2(CrCbt + CrCb) + CrCb) */ \ /* for correct rounding */ \ 
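  /* mm7 holds 0x01 in every byte (loaded from byte_one by the caller); */    \
  /* subtracting it compensates for pavgb rounding up on both averages */     \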
"psubusb %%mm7, %%mm3 \n\t" \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb1 = 1/8CrCbt + 7/8CrCb */ \ "movq %%mm0, %%mm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%mm3, %%mm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movntq %%mm2, (%1) \n\t" /* Store low YUYV1 */ \ "punpckhbw %%mm3, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV1 */ \ : \ : "r" (p_y1), "r" (p_line1) ); \ __asm__ __volatile__( \ "movd %1, %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ "movd %2, %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%mm4, %%mm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb2 = 1/2(CrCbb + CrCb) */ \ "pavgb %%mm3, %%mm1 \n\t" /* CrCb2 = 1/4CrCbb + 3/4CrCb */ \ /* other cases give error smaller than one with repeated pavgb but here we */ \ /* would get a max error of 1.125. Subtract one to compensate for repeated */ \ /* rounding up (which will give max error of 0.625 which isn't perfect */ \ /* rounding but good enough). */ \ "psubusb %%mm7, %%mm1 \n\t" \ "pavgb %%mm1, %%mm3 \n\t" /* CrCb2 = 3/8CrCbb + 5/8CrCb */ \ "movq %%mm0, %%mm2 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%mm3, %%mm2 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ "movntq %%mm2, (%3) \n\t" /* Store low YUYV2 */ \ "punpckhbw %%mm3, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_y2), "m" (*p_ub), "m" (*p_vb), "r" (p_line2) ); \ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ } while(0) #define SSE2_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2) \ do { \ __asm__ __volatile__( \ "movq %0, %%xmm1 \n\t" /* Load 8 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movq %1, %%xmm2 \n\t" /* Load 8 Cr 00 00 00 00 v3 v2 v1 v0 */ \ "movq %2, %%xmm3 \n\t" /* Load 8 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ "movq %3, %%xmm4 \n\t" /* Load 8 Crt 00 00 00 00 v3 v2 v1 v0 */ \ "punpcklbw %%xmm2, %%xmm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ : \ : "m" (*p_u), "m" (*p_v), "m" (*p_ut), "m" (*p_vt) ); \ __asm__ __volatile__( \ "movdqa (%0), %%xmm0 \n\t" /* Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%xmm4, %%xmm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb1 = 1/2(CrCbt + CrCb) */ \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb1 = 1/2(1/2(CrCbt + CrCb) + CrCb) */ \ /* for correct rounding */ \ "psubusb %%xmm7, %%xmm3 \n\t" \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb1 = 1/8CrCbt + 7/8CrCb */ \ "movdqa %%xmm0, %%xmm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ "punpcklbw %%xmm3, %%xmm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movntdq %%xmm2, (%1) \n\t" /* Store low YUYV1 */ \ "punpckhbw %%xmm3, %%xmm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ "movntdq %%xmm0, 16(%1) \n\t" /* Store high YUYV1 */ \ : \ : "r" (p_y1), "r" (p_line1) ); \ __asm__ __volatile__( \ "movq %1, %%xmm3 \n\t" /* Load 8 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ "movq %2, %%xmm4 \n\t" /* Load 8 Crb 00 00 00 00 v3 v2 v1 v0 */ \ "movdqa (%0), %%xmm0 \n\t" /* Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%xmm4, %%xmm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb2 = 1/2(CrCbb + CrCb) */ \ "pavgb %%xmm3, %%xmm1 \n\t" /* CrCb2 = 1/4CrCbb + 3/4CrCb */ \ /* other cases give error smaller than one with repeated pavgb but here we */ \ /* would get a max error of 1.125. 
Subtract one to compensate for repeated */ \ /* rounding up (which will give max error of 0.625 which isn't perfect */ \ /* rounding but good enough). */ \ "psubusb %%xmm7, %%xmm1 \n\t" \ "pavgb %%xmm1, %%xmm3 \n\t" /* CrCb2 = 3/8CrCbb + 5/8CrCb */ \ "movdqa %%xmm0, %%xmm2 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ "punpcklbw %%xmm3, %%xmm2 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ "movntdq %%xmm2, (%3) \n\t" /* Store low YUYV2 */ \ "punpckhbw %%xmm3, %%xmm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ "movntdq %%xmm0, 16(%3) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_y2), "m" (*p_ub), "m" (*p_vb), "r" (p_line2) ); \ p_line1 += 32; p_line2 += 32; p_y1 += 16; p_y2 += 16; p_u += 8; p_v += 8; \ p_ub += 8; p_vb += 8; p_ut += 8; p_vt += 8; \ } while(0) #endif static void yv12_to_yuy2_mmxext (const unsigned char *y_src, int y_src_pitch, const unsigned char *u_src, int u_src_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *yuy2_map, int yuy2_pitch, int width, int height, int progressive ) { #if defined(ARCH_X86) || defined(ARCH_X86_64) uint8_t *p_line1, *p_line2 = yuy2_map; const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; const uint8_t *p_ub, *p_vb; const uint8_t *p_ut = u_src; const uint8_t *p_vt = v_src; int i_x, i_y; int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; const int i_source_v_margin = v_src_pitch - width/2; const int i_dest_margin = yuy2_pitch - width*2; __asm__ __volatile__( "movq %0, %%mm7 \n\t" : : "m" (byte_one) ); if( progressive ) { for( i_y = height / 2; i_y-- ; ) { p_line1 = p_line2; p_line2 += yuy2_pitch; p_y1 = p_y2; p_y2 += y_src_pitch; if( i_y > 1 ) { p_ub = p_u + u_src_pitch; p_vb = p_v + v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } for( i_x = width / 8 ; i_x-- ; ) { MMXEXT_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { C_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; p_ut = p_u - u_src_pitch; p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += 2 * yuy2_pitch; p_y1 = p_y2; p_y2 += 2 * y_src_pitch; if( i_y > 1 ) { p_ub = p_u + 2 * u_src_pitch; p_vb = p_v + 2 * v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } /* 2 odd lines */ for( i_x = width / 8 ; i_x-- ; ) { MMXEXT_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { C_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } p_y1 += i_source_margin; p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; p_ut += i_source_u_margin; p_vt += i_source_v_margin; p_ub += i_source_u_margin; p_vb += i_source_v_margin; p_line1 += i_dest_margin; p_line2 += i_dest_margin; /* 2 even lines - arguments need to be swapped */ for( i_x = width / 8 ; i_x-- ; ) { MMXEXT_YUV420_YUYV_INTERLACED(p_y2,p_y1,p_u,p_ub,p_ut,p_v,p_vb,p_vt,p_line2,p_line1); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { C_YUV420_YUYV_INTERLACED(p_y2,p_y1,p_u,p_ub,p_ut,p_v,p_vb,p_vt,p_line2,p_line1); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; p_ut = p_u - 2 * u_src_pitch; p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin; } } sfence(); emms(); #endif } static void yv12_to_yuy2_sse2 (const unsigned char *y_src, int y_src_pitch, const unsigned 
char *u_src, int u_src_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *yuy2_map, int yuy2_pitch, int width, int height, int progressive ) { #if defined(ARCH_X86) || defined(ARCH_X86_64) /* check alignment */ if (((uintptr_t)y_src | (uintptr_t)yuy2_map | yuy2_pitch | y_src_pitch) & 15) { yv12_to_yuy2_mmxext(y_src, y_src_pitch, u_src, u_src_pitch, v_src, v_src_pitch, yuy2_map, yuy2_pitch, width, height, progressive); return; } uint8_t *p_line1, *p_line2 = yuy2_map; const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; const uint8_t *p_ub, *p_vb; const uint8_t *p_ut = u_src; const uint8_t *p_vt = v_src; int i_x, i_y; int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; const int i_source_v_margin = v_src_pitch - width/2; const int i_dest_margin = yuy2_pitch - width*2; static const sse_t qw_byte_one = { uq: { 0x0101010101010101ll, 0x0101010101010101ll } }; __asm__ __volatile__( "movdqa %0, %%xmm7 \n\t" : : "m" (qw_byte_one) ); if( progressive ) { for( i_y = height / 2; i_y-- ; ) { p_line1 = p_line2; p_line2 += yuy2_pitch; p_y1 = p_y2; p_y2 += y_src_pitch; if( i_y > 1 ) { p_ub = p_u + u_src_pitch; p_vb = p_v + v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } for( i_x = width / 16 ; i_x-- ; ) { SSE2_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } for( i_x = (width % 16) / 2 ; i_x-- ; ) { C_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; p_ut = p_u - u_src_pitch; p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += 2 * yuy2_pitch; p_y1 = p_y2; p_y2 += 2 * y_src_pitch; if( i_y > 1 ) { p_ub = p_u + 2 * u_src_pitch; p_vb = p_v + 2 * v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } /* 2 odd lines */ for( i_x = width / 16 ; i_x-- ; ) { SSE2_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } for( i_x = (width % 16) / 2 ; i_x-- ; ) { C_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } p_y1 += i_source_margin; p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; p_ut += i_source_u_margin; p_vt += i_source_v_margin; p_ub += i_source_u_margin; p_vb += i_source_v_margin; p_line1 += i_dest_margin; p_line2 += i_dest_margin; /* 2 even lines - arguments need to be swapped */ for( i_x = width / 16 ; i_x-- ; ) { SSE2_YUV420_YUYV_INTERLACED(p_y2,p_y1,p_u,p_ub,p_ut,p_v,p_vb,p_vt,p_line2,p_line1); } for( i_x = (width % 16) / 2 ; i_x-- ; ) { C_YUV420_YUYV_INTERLACED(p_y2,p_y1,p_u,p_ub,p_ut,p_v,p_vb,p_vt,p_line2,p_line1); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; p_ut = p_u - 2 * u_src_pitch; p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin; } } sfence(); #endif } /* identical to yv12_to_yuy2_c with the obvious exception... 
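 * (the bulk of each line goes through the MMX_YUV420_YUYV_* macros, with
 * the C_YUV420_YUYV_* macros mopping up the leftover pixels)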
*/ static void yv12_to_yuy2_mmx (const unsigned char *y_src, int y_src_pitch, const unsigned char *u_src, int u_src_pitch, const unsigned char *v_src, int v_src_pitch, unsigned char *yuy2_map, int yuy2_pitch, int width, int height, int progressive ) { #if defined(ARCH_X86) || defined(ARCH_X86_64) uint8_t *p_line1, *p_line2 = yuy2_map; const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; const uint8_t *p_ub, *p_vb; const uint8_t *p_ut = u_src; const uint8_t *p_vt = v_src; int i_x, i_y; int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; const int i_source_v_margin = v_src_pitch - width/2; const int i_dest_margin = yuy2_pitch - width*2; if( progressive ) { for( i_y = height / 2; i_y-- ; ) { p_line1 = p_line2; p_line2 += yuy2_pitch; p_y1 = p_y2; p_y2 += y_src_pitch; if( i_y > 1 ) { p_ub = p_u + u_src_pitch; p_vb = p_v + v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } for( i_x = width / 8 ; i_x-- ; ) { MMX_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { C_YUV420_YUYV_PROGRESSIVE(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; p_ut = p_u - u_src_pitch; p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += 2 * yuy2_pitch; p_y1 = p_y2; p_y2 += 2 * y_src_pitch; if( i_y > 1 ) { p_ub = p_u + 2 * u_src_pitch; p_vb = p_v + 2 * v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } for( i_x = width / 8 ; i_x-- ; ) { MMX_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { C_YUV420_YUYV_INTERLACED(p_y1,p_y2,p_u,p_ut,p_ub,p_v,p_vt,p_vb,p_line1,p_line2); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; p_ut = p_u - 2 * u_src_pitch; p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } p_line2 = yuy2_map + yuy2_pitch; p_y2 = y_src + y_src_pitch; p_u = u_src + u_src_pitch; p_v = v_src + v_src_pitch; p_ut = p_u; p_vt = p_v; for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += 2 * yuy2_pitch; p_y1 = p_y2; p_y2 += 2 * y_src_pitch; if( i_y > 1 ) { p_ub = p_u + 2 * u_src_pitch; p_vb = p_v + 2 * v_src_pitch; } else { p_ub = p_u; p_vb = p_v; } /* swap arguments for even lines */ for( i_x = width / 8 ; i_x-- ; ) { MMX_YUV420_YUYV_INTERLACED(p_y2,p_y1,p_u,p_ub,p_ut,p_v,p_vb,p_vt,p_line2,p_line1); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { C_YUV420_YUYV_INTERLACED(p_y2,p_y1,p_u,p_ub,p_ut,p_v,p_vb,p_vt,p_line2,p_line1); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; p_ut = p_u - 2 * u_src_pitch; p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } } sfence(); emms(); #endif } #define C_YUYV_YUV420( ) \ *p_y1++ = *p_line1++; *p_y2++ = *p_line2++; \ *p_u++ = (*p_line1++ + *p_line2++)>>1; \ *p_y1++ = *p_line1++; *p_y2++ = *p_line2++; \ *p_v++ = (*p_line1++ + *p_line2++)>>1; static void yuy2_to_yv12_c (const unsigned char *yuy2_map, int yuy2_pitch, unsigned char *y_dst, int y_dst_pitch, unsigned char *u_dst, int u_dst_pitch, unsigned char *v_dst, int v_dst_pitch, int width, int height) { const uint8_t *p_line1, *p_line2 = yuy2_map; uint8_t *p_y1, *p_y2 = y_dst; uint8_t *p_u = u_dst; uint8_t *p_v = v_dst; int i_x, i_y; const int 
i_dest_margin = y_dst_pitch - width; const int i_dest_u_margin = u_dst_pitch - width/2; const int i_dest_v_margin = v_dst_pitch - width/2; const int i_source_margin = yuy2_pitch - width*2; for( i_y = height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += yuy2_pitch; p_y1 = p_y2; p_y2 += y_dst_pitch; for( i_x = width / 8 ; i_x-- ; ) { C_YUYV_YUV420( ); C_YUYV_YUV420( ); C_YUYV_YUV420( ); C_YUYV_YUV420( ); } p_y2 += i_dest_margin; p_u += i_dest_u_margin; p_v += i_dest_v_margin; p_line2 += i_source_margin; } } #if defined(ARCH_X86) || defined(ARCH_X86_64) /* yuy2->yv12 with subsampling (some ideas from mplayer's yuy2toyv12) */ #define MMXEXT_YUYV_YUV420( ) \ do { \ __asm__ __volatile__( \ "movq (%0), %%mm0 \n\t" /* Load v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movq 8(%0), %%mm1 \n\t" /* Load v3 y7 u3 y6 v2 y5 u2 y4 */ \ "movq %%mm0, %%mm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movq %%mm1, %%mm3 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ "psrlw $8, %%mm0 \n\t" /* 00 v1 00 u1 00 v0 00 u0 */ \ "psrlw $8, %%mm1 \n\t" /* 00 v3 00 u3 00 v2 00 u2 */ \ "pand %%mm7, %%mm2 \n\t" /* 00 y3 00 y2 00 y1 00 y0 */ \ "pand %%mm7, %%mm3 \n\t" /* 00 y7 00 y6 00 y5 00 y4 */ \ "packuswb %%mm1, %%mm0 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "packuswb %%mm3, %%mm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ "movntq %%mm2, (%1) \n\t" /* Store YYYYYYYY line1 */ \ : \ : "r" (p_line1), "r" (p_y1) ); \ __asm__ __volatile__( \ "movq (%0), %%mm1 \n\t" /* Load v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movq 8(%0), %%mm2 \n\t" /* Load v3 y7 u3 y6 v2 y5 u2 y4 */ \ "movq %%mm1, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ "movq %%mm2, %%mm4 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ "psrlw $8, %%mm1 \n\t" /* 00 v1 00 u1 00 v0 00 u0 */ \ "psrlw $8, %%mm2 \n\t" /* 00 v3 00 u3 00 v2 00 u2 */ \ "pand %%mm7, %%mm3 \n\t" /* 00 y3 00 y2 00 y1 00 y0 */ \ "pand %%mm7, %%mm4 \n\t" /* 00 y7 00 y6 00 y5 00 y4 */ \ "packuswb %%mm2, %%mm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "packuswb %%mm4, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ "movntq %%mm3, (%1) \n\t" /* Store YYYYYYYY line2 */ \ : \ : "r" (p_line2), "r" (p_y2) ); \ __asm__ __volatile__( \ "pavgb %%mm1, %%mm0 \n\t" /* (mean) v3 u3 v2 u2 v1 u1 v0 u0 */ \ "movq %%mm0, %%mm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ "psrlw $8, %%mm0 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ "packuswb %%mm0, %%mm0 \n\t" /* v3 v2 v1 v0 */ \ "movd %%mm0, (%0) \n\t" /* Store VVVV */ \ "pand %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ "packuswb %%mm1, %%mm1 \n\t" /* u3 u2 u1 u0 */ \ "movd %%mm1, (%1) \n\t" /* Store UUUU */ \ : \ : "r" (p_v), "r" (p_u) ); \ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ } while(0) #endif static void yuy2_to_yv12_mmxext (const unsigned char *yuy2_map, int yuy2_pitch, unsigned char *y_dst, int y_dst_pitch, unsigned char *u_dst, int u_dst_pitch, unsigned char *v_dst, int v_dst_pitch, int width, int height) { #if defined(ARCH_X86) || defined(ARCH_X86_64) const uint8_t *p_line1, *p_line2 = yuy2_map; uint8_t *p_y1, *p_y2 = y_dst; uint8_t *p_u = u_dst; uint8_t *p_v = v_dst; int i_x, i_y; const int i_dest_margin = y_dst_pitch - width; const int i_dest_u_margin = u_dst_pitch - width/2; const int i_dest_v_margin = v_dst_pitch - width/2; const int i_source_margin = yuy2_pitch - width*2; __asm__ __volatile__( "pcmpeqw %mm7, %mm7 \n\t" "psrlw $8, %mm7 \n\t" /* 00 ff 00 ff 00 ff 00 ff */ ); for( i_y = height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += yuy2_pitch; p_y1 = p_y2; p_y2 += y_dst_pitch; for( i_x = width / 8 ; i_x-- ; ) { MMXEXT_YUYV_YUV420( ); } p_y2 += i_dest_margin; p_u += 
i_dest_u_margin; p_v += i_dest_v_margin; p_line2 += i_source_margin; } sfence(); emms(); #endif } /* * init_yuv_conversion * * This function precalculates all of the tables used for converting RGB * values to YUV values. This function also decides which conversion * functions to use. */ void init_yuv_conversion(void) { int i; /* initialize the RGB -> YUV tables */ for (i = 0; i < 256; i++) { y_r_table[i] = Y_R * i + Y_MOD; y_g_table[i] = Y_G * i; y_b_table[i] = Y_B * i; u_r_table[i] = U_R * i; u_g_table[i] = U_G * i; u_b_table[i] = U_B * i; v_r_table[i] = V_R * i; v_g_table[i] = V_G * i; v_b_table[i] = V_B * i; } /* determine best YUV444 -> YUY2 converter to use */ if (xine_mm_accel() & MM_ACCEL_X86_MMX) yuv444_to_yuy2 = yuv444_to_yuy2_mmx; else yuv444_to_yuy2 = yuv444_to_yuy2_c; /* determine best YV12 -> YUY2 converter to use */ if (xine_mm_accel() & MM_ACCEL_X86_SSE2) yv12_to_yuy2 = yv12_to_yuy2_sse2; else if (xine_mm_accel() & MM_ACCEL_X86_MMXEXT) yv12_to_yuy2 = yv12_to_yuy2_mmxext; else if (xine_mm_accel() & MM_ACCEL_X86_MMX) yv12_to_yuy2 = yv12_to_yuy2_mmx; else yv12_to_yuy2 = yv12_to_yuy2_c; /* determine best YUY2 -> YV12 converter to use */ if (xine_mm_accel() & MM_ACCEL_X86_MMXEXT) yuy2_to_yv12 = yuy2_to_yv12_mmxext; else yuy2_to_yv12 = yuy2_to_yv12_c; /* determine best YUV9 -> YV12 converter to use (only the portable C * version is available so far) */ yuv9_to_yv12 = yuv9_to_yv12_c; /* determine best YUV411 -> YV12 converter to use (only the portable C * version is available so far) */ yuv411_to_yv12 = yuv411_to_yv12_c; } /* TJ. direct sliced rgb -> yuy2 conversion */ typedef struct { /* unused:1 u:21 v:21 y:21 */ uint64_t t0[256], t1[256], t2[256]; /* u:12 v:12 y:8 (rgb_pal8) */ /* native endian yuyv (other rgb_pal*) */ uint32_t p[272]; int cm, fmt, pfmt; } rgb2yuy2_t; typedef enum { rgb_bgr = 0, rgb_rgb, rgb_bgra, rgb_argb, rgb_rgba, rgb_rgb555le, rgb_rgb555be, rgb_rgb565le, rgb_rgb565be, rgb_bgra_clut, rgb_rgba_clut, rgb_pal8, rgb_pal4, rgb_pal2, rgb_pal1 } rgb_fmt_t; void *rgb2yuy2_alloc (int color_matrix, const char *format) { rgb2yuy2_t *b; float kb, kr; float _ry, _gy, _by, _yoffs; float _bv, _bvoffset, _ru, _ruoffset, _gu, _guoffset, _gv, _gvoffset, _burv; int i = -1; const char *fmts[] = { "bgr", "rgb", "bgra", "argb", "rgba", "rgb555le", "rgb555be", "rgb565le", "rgb565be", "bgra_clut", "rgba_clut" }; if (format) for (i = 10; i >= 0; i--) if (!strcmp (format, fmts[i])) break; if (i < 0) return NULL; b = malloc (sizeof (*b)); if (!b) return b; b->pfmt = -1; b->fmt = i; b->cm = color_matrix; switch ((b->cm) >> 1) { case 1: kb = 0.0722; kr = 0.2126; break; /* ITU-R 709 */ case 4: kb = 0.1100; kr = 0.3000; break; /* FCC */ case 7: kb = 0.0870; kr = 0.2120; break; /* SMPTE 240 */ default: kb = 0.1140; kr = 0.2990; /* ITU-R 601 */ } if (b->cm & 1) { /* fullrange */ _ry = 8192.0 * kr; _gy = 8192.0 * (1.0 - kb - kr); _by = 8192.0 * kb; _yoffs = 8192.0 * 0.5; _burv = 4096.0 * (127.0 / 255.0); } else { /* mpeg range */ _ry = 8192.0 * (219.0 / 255.0) * kr; _gy = 8192.0 * (219.0 / 255.0) * (1.0 - kb - kr); _by = 8192.0 * (219.0 / 255.0) * kb; _yoffs = 8192.0 * 16.5; _burv = 4096.0 * (112.0 / 255.0); } _ru = _burv * (kr / (kb - 1.0)); _gu = _burv * ((1.0 - kb - kr) / (kb - 1.0)); _bv = _burv * (kb / (kr - 1.0)); _gv = _burv * ((1.0 - kb - kr) / (kr - 1.0)); switch (b->fmt) { case rgb_bgr: case rgb_rgb: case rgb_bgra: case rgb_argb: case rgb_rgba: case rgb_bgra_clut: case rgb_rgba_clut: { /* 24/32 bit */ uint64_t *rr, *gg, *bb; if ((b->fmt == rgb_bgr) || (b->fmt == rgb_bgra) || (b->fmt 
== rgb_bgra_clut)) { bb = b->t0; gg = b->t1; rr = b->t2; } else { rr = b->t0; gg = b->t1; bb = b->t2; } /* Split uv offsets to make all values non negative. This prevents carry between components. */ _ruoffset = -255.0 * _ru; _guoffset = 4096.0 * 128.5 - _ruoffset; _bvoffset = -255.0 * _bv; _gvoffset = 4096.0 * 128.5 - _bvoffset; for (i = 0; i < 256; i++) { rr[i] = ((uint64_t)(_ru * i + _ruoffset + 0.5) << 42) | ((uint64_t)(_burv * i + 0.5) << 21) | (uint64_t)(_ry * i + 0.5); gg[i] = ((uint64_t)(_gu * i + _guoffset + 0.5) << 42) | ((uint64_t)(_gv * i + _gvoffset + 0.5) << 21) | (uint64_t)(_gy * i + _yoffs + 0.5); bb[i] = ((uint64_t)(_burv * i + 0.5) << 42) | ((uint64_t)(_bv * i + _bvoffset + 0.5) << 21) | (uint64_t)(_by * i + 0.5); } } break; case rgb_rgb555le: case rgb_rgb555be: { /* 15 bit */ uint64_t *hi, *lo; float _uloffset, _uhoffset, _vloffset, _vhoffset; /* A little more preparation for Verrifast Plain Co. */ if (b->fmt == rgb_rgb555le) { lo = b->t0; hi = b->t1; } else { hi = b->t0; lo = b->t1; } /* gl <= 0x39, gh <= 0xc6 - see below */ _uloffset = -(float)0x39 * _gu; _uhoffset = 4096.0 * 128.5 - _uloffset; _vhoffset = -(float)0xc6 * _gv; _vloffset = 4096.0 * 128.5 - _vhoffset; for (i = 0; i < 256; i++) { int rr, gl, gh, bb; /* extract rgb parts from high byte */ rr = (i << 1) & 0xf8; gh = (i << 6) & 0xc0; /* and from low byte */ gl = (i >> 2) & 0x38; bb = (i << 3) & 0xf8; /* scale them to 8 bits */ rr |= rr >> 5; gl |= gl >> 5; gh |= gh >> 5; bb |= bb >> 5; /* setup low byte lookup */ lo[i] = ((uint64_t)(_burv * bb + _gu * gl + _uloffset + 0.5) << 42) | ((uint64_t)( _bv * bb + _gv * gl + _vloffset + 0.5) << 21) | (uint64_t)( _by * bb + _gy * gl + 0.5); /* and high byte */ hi[i] = ((uint64_t)( _ru * rr + _gu * gh + _uhoffset + 0.5) << 42) | ((uint64_t)(_burv * rr + _gv * gh + _vhoffset + 0.5) << 21) | (uint64_t)( _ry * rr + _gy * gh + _yoffs + 0.5); } } break; case rgb_rgb565le: case rgb_rgb565be: { /* 16 bit */ uint64_t *lo, *hi; float _uloffset, _uhoffset, _vloffset, _vhoffset; /* Much like 15 bit */ if (b->fmt == rgb_rgb565le) { lo = b->t0; hi = b->t1; } else { hi = b->t0; lo = b->t1; } /* gl <= 0x1c, gh <= 0xe3 - see below */ _uloffset = -(float)0x1c * _gu; _uhoffset = 4096.0 * 128.5 - _uloffset; _vhoffset = -(float)0xe3 * _gv; _vloffset = 4096.0 * 128.5 - _vhoffset; for (i = 0; i < 256; i++) { int rr, gl, gh, bb; /* extract rgb parts from high byte */ rr = i & 0xf8; gh = (i << 5) & 0xe0; /* and from low byte */ gl = (i >> 3) & 0x1c; bb = (i << 3) & 0xf8; /* scale them to 8 bits */ rr |= rr >> 5; gh |= gh >> 6; bb |= bb >> 5; /* setup low byte lookup */ lo[i] = ((uint64_t)(_burv * bb + _gu * gl + _uloffset + 0.5) << 42) | ((uint64_t)( _bv * bb + _gv * gl + _vloffset + 0.5) << 21) | (uint64_t)( _by * bb + _gy * gl + 0.5); /* and high byte */ hi[i] = ((uint64_t)( _ru * rr + _gu * gh + _uhoffset + 0.5) << 42) | ((uint64_t)(_burv * rr + _gv * gh + _vhoffset + 0.5) << 21) | (uint64_t)( _ry * rr + _gy * gh + _yoffs + 0.5); } } break; default: ; } return b; } void rgb2yuy2_free (void *rgb2yuy2) { free (rgb2yuy2); } void rgb2yuy2_palette (void *rgb2yuy2, const uint8_t *pal, int num_colors, int bits_per_pixel) { rgb2yuy2_t *b = rgb2yuy2; uint64_t v; uint32_t w, *p; int mode, i = 0, max_colors; if (!b || (num_colors < 2)) return; switch (bits_per_pixel) { case 8: mode = rgb_pal8; max_colors = 256; p = b->p; break; case 4: mode = rgb_pal4; max_colors = 16; p = b->p + 256; break; case 2: mode = rgb_pal2; max_colors = 4; p = b->p + 256; break; case 1: mode = rgb_pal1; max_colors = 2; p = 
void rgb2yuy2_palette (void *rgb2yuy2, const uint8_t *pal, int num_colors, int bits_per_pixel) {
  rgb2yuy2_t *b = rgb2yuy2;
  uint64_t v;
  uint32_t w, *p;
  int mode, i = 0, max_colors;

  if (!b || (num_colors < 2))
    return;

  switch (bits_per_pixel) {
    case 8: mode = rgb_pal8; max_colors = 256; p = b->p;       break;
    case 4: mode = rgb_pal4; max_colors = 16;  p = b->p + 256; break;
    case 2: mode = rgb_pal2; max_colors = 4;   p = b->p + 256; break;
    case 1: mode = rgb_pal1; max_colors = 2;   p = b->p + 256; break;
    default: return;
  }
  if (num_colors > max_colors)
    num_colors = max_colors;

  /* fmt now refers to the format of the _palette_ */
  if (b->pfmt == -1)
    b->pfmt = b->fmt;
  b->fmt = mode;

  /* default grays */
  if (!pal) {
    for (i = 0; i < num_colors; i++) {
      w = (i * 255 + (num_colors - 1) / 2) / (num_colors - 1);
      v = b->t0[w] + b->t1[w] + b->t2[w];
      p[i] = ((v >> 31) & 0xfff00000) | ((v >> 22) & 0xfff00) | ((v >> 13) & 0xff);
    }
  } else
  /* convert palette */
  switch (b->pfmt) {
    case rgb_bgr:
    case rgb_rgb:
      for (i = 0; i < num_colors; i++) {
        v = b->t0[*pal++];
        v += b->t1[*pal++];
        v += b->t2[*pal++];
        p[i] = ((v >> 31) & 0xfff00000) | ((v >> 22) & 0xfff00) | ((v >> 13) & 0xff);
      }
      break;
    case rgb_argb:
      pal++;
      /* fall through */
    case rgb_bgra:
    case rgb_rgba:
      for (i = 0; i < num_colors; i++) {
        v = b->t0[*pal++];
        v += b->t1[*pal++];
        v += b->t2[*pal];
        pal += 2;
        p[i] = ((v >> 31) & 0xfff00000) | ((v >> 22) & 0xfff00) | ((v >> 13) & 0xff);
      }
      break;
    default: ;
  }

  /* black pad rest of palette */
  v = b->t0[0] + b->t1[0] + b->t2[0];
  w = ((v >> 31) & 0xfff00000) | ((v >> 22) & 0xfff00) | ((v >> 13) & 0xff);
  for (; i < max_colors; i++)
    p[i] = w;

  /* more optimization ahead? */
  if (mode != rgb_pal8) {
    union {uint8_t bytes[4]; uint32_t word;} tmp;
    switch (mode) {
      case rgb_pal4:
        for (i = 0; i < 256; i++) {
          w = p[i >> 4];
          tmp.bytes[0] = w;       /* y1 */
          w &= ~255;
          w += p[i & 15];
          tmp.bytes[1] = w >> 24; /* u */
          tmp.bytes[2] = w;       /* y2 */
          tmp.bytes[3] = w >> 12; /* v */
          b->p[i] = tmp.word;
        }
        break;
      case rgb_pal2:
        for (i = 0; i < 16; i++) {
          w = p[i >> 2];
          tmp.bytes[0] = w;       /* y1 */
          w &= ~255;
          w += p[i & 3];
          tmp.bytes[1] = w >> 24; /* u */
          tmp.bytes[2] = w;       /* y2 */
          tmp.bytes[3] = w >> 12; /* v */
          b->p[i] = tmp.word;
        }
        break;
      case rgb_pal1:
        for (i = 0; i < 4; i++) {
          w = p[i >> 1];
          tmp.bytes[0] = w;       /* y1 */
          w &= ~255;
          w += p[i & 1];
          tmp.bytes[1] = w >> 24; /* u */
          tmp.bytes[2] = w;       /* y2 */
          tmp.bytes[3] = w >> 12; /* v */
          b->p[i] = tmp.word;
        }
        break;
      default: ;
    }
  }
}
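#if 0
/*
 * Added palette sketch (not compiled in; the wrapper name is an
 * illustrative assumption): allocate the converter for the format the
 * palette entries are stored in, then let rgb2yuy2_palette () switch it
 * to the matching rgb_pal* mode. Passing pal == NULL installs a gray
 * ramp instead, and missing entries are padded with black.
 */
static void example_pal8_setup (int color_matrix, const uint8_t rgb_pal[256 * 3],
  void **cv) {
  /* palette entries are 3 bytes r, g, b each -> allocate as "rgb" */
  *cv = rgb2yuy2_alloc (color_matrix, "rgb");
  if (*cv)
    rgb2yuy2_palette (*cv, rgb_pal, 256, 8); /* 8 bpp indexed video follows */
}
#endif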
void rgb2yuy2_slice (void *rgb2yuy2, const uint8_t *in, int ipitch, uint8_t *out, int opitch,
  int width, int height) {
  rgb2yuy2_t *b = rgb2yuy2;
  uint64_t v;
  uint32_t w;
  int ipad, opad;
  int x, y;

  if (!b)
    return;
  width &= ~1;
  opad = opitch - 2 * width;

  switch (b->fmt) {
    case rgb_bgr:
    case rgb_rgb:
      ipad = ipitch - 3 * width;
      for (y = height; y; y--) {
        for (x = width / 2; x; x--) {
          v = b->t0[*in++];
          v += b->t1[*in++];
          v += b->t2[*in++];
          *out++ = v >> 13; /* y1 */
          v &= ~0x1fffffLL;
          v += b->t0[*in++];
          v += b->t1[*in++];
          v += b->t2[*in++];
          *out++ = v >> 55; /* u */
          *out++ = v >> 13; /* y2 */
          *out++ = v >> 34; /* v */
        }
        in += ipad;
        out += opad;
      }
      break;
    case rgb_argb:
      in++;
      /* fall through */
    case rgb_bgra:
    case rgb_rgba:
      ipad = ipitch - 4 * width;
      for (y = height; y; y--) {
        for (x = width / 2; x; x--) {
          v = b->t0[*in++];
          v += b->t1[*in++];
          v += b->t2[*in];
          in += 2;
          *out++ = v >> 13; /* y1 */
          v &= ~0x1fffffLL;
          v += b->t0[*in++];
          v += b->t1[*in++];
          v += b->t2[*in];
          in += 2;
          *out++ = v >> 55; /* u */
          *out++ = v >> 13; /* y2 */
          *out++ = v >> 34; /* v */
        }
        in += ipad;
        out += opad;
      }
      break;
    case rgb_rgb555le:
    case rgb_rgb565le:
    case rgb_rgb555be:
    case rgb_rgb565be:
      ipad = ipitch - 2 * width;
      for (y = height; y; y--) {
        for (x = width / 2; x; x--) {
          v = b->t0[*in++];
          v += b->t1[*in++];
          *out++ = v >> 13; /* y1 */
          v &= ~0x1fffffLL;
          v += b->t0[*in++];
          v += b->t1[*in++];
          *out++ = v >> 55; /* u */
          *out++ = v >> 13; /* y2 */
          *out++ = v >> 34; /* v */
        }
        in += ipad;
        out += opad;
      }
      break;
    case rgb_pal8:
      ipad = ipitch - width;
      for (y = height; y; y--) {
        for (x = width / 2; x; x--) {
          w = b->p[*in++];
          *out++ = w; /* y1 */
          w &= ~0xffL;
          w += b->p[*in++];
          *out++ = w >> 24; /* u */
          *out++ = w;       /* y2 */
          *out++ = w >> 12; /* v */
        }
        in += ipad;
        out += opad;
      }
      break;
    case rgb_pal4:
      ipad = ipitch - (width / 2);
      for (y = height; y; y--) {
        uint32_t *o2 = (uint32_t *)out;
        for (x = width / 2; x; x--)
          *o2++ = b->p[*in++];
        in += ipad;
        out += opitch;
      }
      break;
    case rgb_pal2:
      ipad = ipitch - (width / 4);
      for (y = height; y; y--) {
        uint32_t *o2 = (uint32_t *)out;
        for (x = width / 4; x; x--) {
          *o2++ = b->p[(*in) >> 4];
          *o2++ = b->p[(*in++) & 15];
        }
        if (width & 3)
          *o2++ = b->p[(*in) >> 4];
        in += ipad;
        out += opitch;
      }
      break;
    case rgb_pal1:
      ipad = ipitch - (width / 8);
      for (y = height; y; y--) {
        uint32_t *o2 = (uint32_t *)out;
        for (x = width / 8; x; x--) {
          *o2++ = b->p[(*in) >> 6];
          *o2++ = b->p[((*in) >> 4) & 3];
          *o2++ = b->p[((*in) >> 2) & 3];
          *o2++ = b->p[(*in++) & 3];
        }
        x = width & 7;
        if (x) {
          *o2++ = b->p[(*in) >> 6];
          if (x > 2) {
            *o2++ = b->p[((*in) >> 4) & 3];
            if (x > 4)
              *o2++ = b->p[((*in) >> 2) & 3];
          }
        }
        in += ipad;
        out += opitch;
      }
      break;
    case rgb_rgba_clut:
    case rgb_bgra_clut:
      ipad = ipitch - 4 * width;
      opad = opitch - 4 * width;
      for (y = height; y; y--) {
        for (x = width; x; x--) {
          v = b->t0[*in++];
          v += b->t1[*in++];
          v += b->t2[*in++];
          *out++ = v >> 54; /* u */
          *out++ = v >> 33; /* v */
          *out++ = v >> 13; /* y */
          *out++ = *in++;   /* a */
        }
        in += ipad;
        out += opad;
      }
      break;
  }
}
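#if 0
/*
 * Added end-to-end sketch (not compiled in; the wrapper name and the
 * pitch choices are illustrative assumptions): decode 4 bpp indexed
 * frames with a 16 entry "bgra" palette (4 bytes per entry, alpha
 * ignored). Input rows hold 2 pixels per byte, output rows 2 bytes per
 * pixel, as rgb2yuy2_slice () expects for rgb_pal4 above.
 */
static void example_pal4_frames (const uint8_t *pixels, const uint8_t bgra_pal[16 * 4],
  int width, int height, uint8_t *yuy2) {
  void *cv = rgb2yuy2_alloc (0, "bgra"); /* 0: ITU-R 601, mpeg range */
  if (!cv)
    return;
  rgb2yuy2_palette (cv, bgra_pal, 16, 4);
  rgb2yuy2_slice (cv, pixels, (width + 1) / 2, yuy2, 2 * width, width, height);
  rgb2yuy2_free (cv);
}
#endif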