 src/xine-utils/color.c | 377 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 288 insertions(+), 89 deletions(-)
diff --git a/src/xine-utils/color.c b/src/xine-utils/color.c
index bea4cd952..3c2388c27 100644
--- a/src/xine-utils/color.c
+++ b/src/xine-utils/color.c
@@ -658,16 +658,31 @@ static void yuv411_to_yv12_c
}
-#define C_YUV420_YUYV( ) \
- *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
- *p_line1++ = *p_u; *p_line2++ = (*p_u++ + *p_u2++)>>1; \
- *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
- *p_line1++ = *p_v; *p_line2++ = (*p_v++ + *p_v2++)>>1;
+#define C_YUV420_YUYV_PROGRESSIVE( ) \
+ utmp = 3 * *p_u++; \
+ vtmp = 3 * *p_v++; \
+ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+ *p_line1++ = (*p_ut++ + utmp) >> 2; *p_line2++ = (utmp + *p_ub++) >> 2; \
+ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+ *p_line1++ = (*p_vt++ + vtmp) >> 2; *p_line2++ = (vtmp + *p_vb++) >> 2; \
+
+#define C_YUV420_YUYV_INTERLACED_ODD( ) \
+ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+ *p_line1++ = (*p_ut++ + *p_u * 7) >> 3; *p_line2++ = (*p_u++ * 5 + *p_ub++ * 3) >> 3; \
+ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+ *p_line1++ = (*p_vt++ + *p_v * 7) >> 3; *p_line2++ = (*p_v++ * 5 + *p_vb++ * 3) >> 3; \
+
+#define C_YUV420_YUYV_INTERLACED_EVEN( ) \
+ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+ *p_line1++ = (*p_ut++ * 3 + *p_u * 5) >> 3; *p_line2++ = (*p_u++ * 7 + *p_ub++) >> 3; \
+ *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+ *p_line1++ = (*p_vt++ * 3 + *p_v * 5) >> 3; *p_line2++ = (*p_v++ * 7 + *p_vb++) >> 3; \
/*****************************************************************************
* I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
* original conversion routine from Videolan project
- * changed to support interlaced frames and use simple mean interpolation [MF]
+ * changed to support interlaced frames and to do proper chroma upsampling
+ * (correct weighting factors, no chroma shift).
*****************************************************************************/
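/*
 * Weighting sketch, assuming MPEG-2 style 4:2:0 chroma siting (chroma row k
 * sits vertically midway between luma rows 2k and 2k+1):
 *   progressive:   out row 2k   = (1*C[k-1] + 3*C[k]) / 4
 *                  out row 2k+1 = (3*C[k]   + 1*C[k+1]) / 4
 * which is what C_YUV420_YUYV_PROGRESSIVE computes via
 *   utmp = 3 * *p_u;  line1 = (*p_ut + utmp) >> 2;  line2 = (utmp + *p_ub) >> 2;
 * The interlaced variants apply the analogous per-field weights:
 *   odd  field:    line1 = (1*top + 7*cur) / 8,   line2 = (5*cur + 3*bot) / 8
 *   even field:    line1 = (3*top + 5*cur) / 8,   line2 = (7*cur + 1*bot) / 8
 */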
static void yv12_to_yuy2_c
(const unsigned char *y_src, int y_src_pitch,
@@ -680,10 +695,12 @@ static void yv12_to_yuy2_c
const uint8_t *p_y1, *p_y2 = y_src;
const uint8_t *p_u = u_src;
const uint8_t *p_v = v_src;
- const uint8_t *p_u2 = u_src + u_src_pitch;
- const uint8_t *p_v2 = v_src + v_src_pitch;
+ const uint8_t *p_ub, *p_vb;
+ const uint8_t *p_ut = u_src;
+ const uint8_t *p_vt = v_src;
int i_x, i_y;
+ int utmp, vtmp;
const int i_source_margin = y_src_pitch - width;
const int i_source_u_margin = u_src_pitch - width/2;
@@ -701,28 +718,29 @@ static void yv12_to_yuy2_c
p_y1 = p_y2;
p_y2 += y_src_pitch;
+ if( i_y > 1 ) {
+ p_ub = p_u + u_src_pitch;
+ p_vb = p_v + v_src_pitch;
+ } else {
+ p_ub = p_u;
+ p_vb = p_v;
+ }
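/* At the bottom of the chroma plane there is no row below the current one,
   so p_ub/p_vb are clamped back to the current row and the bottommost output
   line blends the current chroma with itself, i.e. it just takes it as-is. */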
+
for( i_x = width / 2 ; i_x-- ; )
{
- C_YUV420_YUYV( );
+ C_YUV420_YUYV_PROGRESSIVE( );
}
p_y2 += i_source_margin;
p_u += i_source_u_margin;
p_v += i_source_v_margin;
- if( i_y > 1 ) {
- p_u2 += i_source_u_margin;
- p_v2 += i_source_v_margin;
- } else {
- p_u2 = p_u;
- p_v2 = p_v;
- }
+ p_ut = p_u - u_src_pitch;
+ p_vt = p_v - v_src_pitch;
p_line2 += i_dest_margin;
}
} else {
- p_u2 = u_src + 2*u_src_pitch;
- p_v2 = v_src + 2*v_src_pitch;
for( i_y = height / 4 ; i_y-- ; )
{
p_line1 = p_line2;
@@ -731,21 +749,24 @@ static void yv12_to_yuy2_c
p_y1 = p_y2;
p_y2 += 2 * y_src_pitch;
+ if( i_y > 1 ) {
+ p_ub = p_u + 2 * u_src_pitch;
+ p_vb = p_v + 2 * v_src_pitch;
+ } else {
+ p_ub = p_u;
+ p_vb = p_v;
+ }
+
for( i_x = width / 2 ; i_x-- ; )
{
- C_YUV420_YUYV( );
+ C_YUV420_YUYV_INTERLACED_ODD( );
}
p_y2 += i_source_margin + y_src_pitch;
p_u += i_source_u_margin + u_src_pitch;
p_v += i_source_v_margin + v_src_pitch;
- if( i_y > 1 ) {
- p_u2 += i_source_u_margin + u_src_pitch;
- p_v2 += i_source_v_margin + v_src_pitch;
- } else {
- p_u2 = p_u;
- p_v2 = p_v;
- }
+ p_ut = p_u - 2 * u_src_pitch;
+ p_vt = p_v - 2 * v_src_pitch;
p_line2 += i_dest_margin + yuy2_pitch;
}
@@ -753,8 +774,8 @@ static void yv12_to_yuy2_c
p_y2 = y_src + y_src_pitch;
p_u = u_src + u_src_pitch;
p_v = v_src + v_src_pitch;
- p_u2 = u_src + 3*u_src_pitch;
- p_v2 = v_src + 3*v_src_pitch;
+ p_ut = p_u;
+ p_vt = p_v;
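/* Second pass: the other field. The source pointers now start one line
   further down (y_src + y_src_pitch, u_src + u_src_pitch, ...) and the loop
   below steps two source lines at a time, applying the even-field weights. */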
for( i_y = height / 4 ; i_y-- ; )
{
@@ -764,21 +785,24 @@ static void yv12_to_yuy2_c
p_y1 = p_y2;
p_y2 += 2 * y_src_pitch;
+ if( i_y > 1 ) {
+ p_ub = p_u + 2 * u_src_pitch;
+ p_vb = p_v + 2 * v_src_pitch;
+ } else {
+ p_ub = p_u;
+ p_vb = p_v;
+ }
+
for( i_x = width / 2 ; i_x-- ; )
{
- C_YUV420_YUYV( );
+ C_YUV420_YUYV_INTERLACED_EVEN( );
}
p_y2 += i_source_margin + y_src_pitch;
p_u += i_source_u_margin + u_src_pitch;
p_v += i_source_v_margin + v_src_pitch;
- if( i_y > 1 ) {
- p_u2 += i_source_u_margin + u_src_pitch;
- p_v2 += i_source_v_margin + v_src_pitch;
- } else {
- p_u2 = p_u;
- p_v2 = p_v;
- }
+ p_ut = p_u - 2 * u_src_pitch;
+ p_vt = p_v - 2 * v_src_pitch;
p_line2 += i_dest_margin + yuy2_pitch;
}
@@ -788,38 +812,204 @@ static void yv12_to_yuy2_c
#if defined(ARCH_X86) || defined(ARCH_X86_64)
-#define MMXEXT_YUV420_YUYV( ) \
+#define MMXEXT_YUV420_YUYV_PROGRESSIVE( ) \
do { \
__asm__ __volatile__(".align 8 \n\t" \
"movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \
"movd (%1), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
"movd (%2), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
- "punpcklbw %%mm2, %%mm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \
- "movq %%mm0, %%mm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \
- "punpcklbw %%mm1, %%mm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \
+ "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \
+ "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "psllw $1, %%mm3 \n\t" /* Cb * 2 */ \
+ "psllw $1, %%mm4 \n\t" /* Cr * 2 */ \
+ "paddw %%mm3, %%mm1 \n\t" /* Cb * 3 */ \
+ "paddw %%mm4, %%mm2 \n\t" /* Cr * 3 */ \
: \
: "r" (p_y1), "r" (p_u), "r" (p_v) ); \
__asm__ __volatile__( \
- "movd (%0), %%mm3 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
- "movd (%1), %%mm4 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
- "punpcklbw %%mm4, %%mm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \
- "pavgb %%mm1, %%mm3 \n\t" /* (mean) v3 u3 v2 u2 v1 u1 v0 u0 */ \
+ "movd (%0), %%mm3 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \
+ "movd (%1), %%mm4 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \
+ "movd (%2), %%mm5 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \
+ "movd (%3), %%mm6 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \
+ "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "paddw %%mm1, %%mm3 \n\t" /* Cb1 = Cbt + 3*Cb */ \
+ "paddw %%mm2, %%mm4 \n\t" /* Cr1 = Crt + 3*Cr */ \
+ "paddw %%mm5, %%mm1 \n\t" /* Cb2 = Cbb + 3*Cb */ \
+ "paddw %%mm6, %%mm2 \n\t" /* Cr2 = Crb + 3*Cr */ \
+ "psrlw $2, %%mm3 \n\t" /* Cb1 = (Cbt + 3*Cb) / 4 */ \
+ /* either the shifts by 2 and 8 or mask off bits and shift by 6 */ \
+ "psrlw $2, %%mm4 \n\t" /* Cr1 = (Crt + 3*Cr) / 4 */ \
+ "psllw $8, %%mm4 \n\t" \
+ "por %%mm4, %%mm3 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \
+ "psrlw $2, %%mm1 \n\t" /* Cb2 = (Cbb + 3*Cb) / 4 */ \
+ "psrlw $2, %%mm2 \n\t" /* Cr2 = (Cbb + 3*Cb) / 4 */ \
+ "psllw $8, %%mm2 \n\t" \
+ "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \
+ "movq %%mm0, %%mm1 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \
+ "punpcklbw %%mm3, %%mm1 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \
: \
- : "r" (p_u2), "r" (p_v2) ); \
+ : "r" (p_ut), "r" (p_vt), "r" (p_ub), "r" (p_vb) ); \
__asm__ __volatile__( \
- "movntq %%mm2, (%0) \n\t" /* Store low YUYV */ \
- "punpckhbw %%mm1, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \
- "movntq %%mm0, 8(%0) \n\t" /* Store high YUYV */ \
+ "movntq %%mm1, (%0) \n\t" /* Store low YUYV1 */ \
+ "punpckhbw %%mm3, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \
+ "movntq %%mm0, 8(%0) \n\t" /* Store high YUYV1 */ \
"movq (%2), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
- "movq %%mm0, %%mm2 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
- "punpcklbw %%mm3, %%mm2 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
- "movntq %%mm2, (%1) \n\t" /* Store low YUYV */ \
- "punpckhbw %%mm3, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
- "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV */ \
+ "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+ "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
+ "movntq %%mm1, (%1) \n\t" /* Store low YUYV2 */ \
+ "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
+ "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV2 */ \
: \
: "r" (p_line1), "r" (p_line2), "r" (p_y2) ); \
p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \
- p_u2 += 4; p_v2 += 4; \
+ p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \
+} while(0)
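/* A note on the MMXEXT sequence above: the chroma bytes are widened to 16-bit
   words (punpcklbw with the zeroed mm7) so that the 3*C terms and the sums
   cannot overflow; psrlw $2 is the /4, and the psllw $8 / por pair re-packs
   Cr into the high bytes next to Cb, yielding the same (top + 3*cur)/4 and
   (3*cur + bottom)/4 chroma as C_YUV420_YUYV_PROGRESSIVE. */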
+
+#define MMXEXT_YUV420_YUYV_INTERLACED_ODD( ) \
+do { \
+ __asm__ __volatile__(".align 8 \n\t" \
+ "movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
+ "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
+ "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \
+ "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \
+ "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \
+ "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \
+ "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 */ \
+ "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \
+ "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \
+ "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \
+ "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \
+ : \
+ : "r" (p_u), "r" (p_v) ); \
+ __asm__ __volatile__( \
+ "movd (%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \
+ "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \
+ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb */ \
+ "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \
+ "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \
+ /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \
+ "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \
+ "psllw $8, %%mm6 \n\t" \
+ "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \
+ "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \
+ "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \
+ "movntq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \
+ "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \
+ "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \
+ : \
+ : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \
+ __asm__ __volatile__( \
+ "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \
+ "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \
+ "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+ "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \
+ "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \
+ "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \
+ "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \
+ "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \
+ "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \
+ "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \
+ /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \
+ "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb + 5*Cr) / 8 */ \
+ "psllw $8, %%mm2 \n\t" \
+ "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \
+ "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+ "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
+ "movntq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \
+ "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
+ "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \
+ : \
+ : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \
+ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \
+ p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \
+} while(0)
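/* The interlaced ODD variant is split into three asm blocks: the first
   precomputes 5*C and 7*C for the current chroma row, the second adds the
   row above and stores line1 = (top + 7*cur)/8, the third adds three times
   the row below and stores line2 = (3*bottom + 5*cur)/8. */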
+
+#define MMXEXT_YUV420_YUYV_INTERLACED_EVEN( ) \
+/* same as the ODD variant above, but with the top/bottom and line1/line2 asm \
+   arguments swapped (the register comments below still read as in ODD) */ \
+do { \
+ __asm__ __volatile__(".align 8 \n\t" \
+ "movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
+ "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
+ "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \
+ "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \
+ "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \
+ "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \
+ "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 */ \
+ "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \
+ "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \
+ "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \
+ "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \
+ : \
+ : "r" (p_u), "r" (p_v) ); \
+ __asm__ __volatile__( \
+ "movd (%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \
+ "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \
+ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb */ \
+ "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \
+ "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \
+ /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \
+ "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \
+ "psllw $8, %%mm6 \n\t" \
+ "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \
+ "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \
+ "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \
+ "movntq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \
+ "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \
+ "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \
+ : \
+ : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \
+ __asm__ __volatile__( \
+ "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \
+ "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \
+ "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+ "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \
+ "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \
+ "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \
+ "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \
+ "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \
+ "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \
+ "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \
+ "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \
+ "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \
+ /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \
+ "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb + 5*Cr) / 8 */ \
+ "psllw $8, %%mm2 \n\t" \
+ "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \
+ "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+ "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
+ "movntq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \
+ "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
+ "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \
+ : \
+ : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \
+ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \
+ p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \
} while(0)
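/* Each MMXEXT macro consumes 8 luma samples from each of the two source
   lines plus 4 Cb/Cr samples per chroma row and emits 16 YUYV bytes to each
   output line through non-temporal movntq stores, so the driver loops below
   run width/8 iterations and fall back to the C macros for the remaining
   (width % 8) / 2 pixel pairs. */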
#endif
@@ -835,10 +1025,12 @@ static void yv12_to_yuy2_mmxext
const uint8_t *p_y1, *p_y2 = y_src;
const uint8_t *p_u = u_src;
const uint8_t *p_v = v_src;
- const uint8_t *p_u2 = u_src + u_src_pitch;
- const uint8_t *p_v2 = v_src + v_src_pitch;
+ const uint8_t *p_ub, *p_vb;
+ const uint8_t *p_ut = u_src;
+ const uint8_t *p_vt = v_src;
int i_x, i_y;
+ int utmp, vtmp;
const int i_source_margin = y_src_pitch - width;
const int i_source_u_margin = u_src_pitch - width/2;
@@ -855,32 +1047,33 @@ static void yv12_to_yuy2_mmxext
p_y1 = p_y2;
p_y2 += y_src_pitch;
+ if( i_y > 1 ) {
+ p_ub = p_u + u_src_pitch;
+ p_vb = p_v + v_src_pitch;
+ } else {
+ p_ub = p_u;
+ p_vb = p_v;
+ }
+
for( i_x = width / 8 ; i_x-- ; )
{
- MMXEXT_YUV420_YUYV( );
+ MMXEXT_YUV420_YUYV_PROGRESSIVE( );
}
for( i_x = (width % 8) / 2 ; i_x-- ; )
{
- C_YUV420_YUYV( );
+ C_YUV420_YUYV_PROGRESSIVE( );
}
p_y2 += i_source_margin;
p_u += i_source_u_margin;
p_v += i_source_v_margin;
- if( i_y > 1 ) {
- p_u2 += i_source_u_margin;
- p_v2 += i_source_v_margin;
- } else {
- p_u2 = p_u;
- p_v2 = p_v;
- }
+ p_ut = p_u - u_src_pitch;
+ p_vt = p_v - v_src_pitch;
p_line2 += i_dest_margin;
}
} else {
- p_u2 = u_src + 2*u_src_pitch;
- p_v2 = v_src + 2*v_src_pitch;
for( i_y = height / 4 ; i_y-- ; )
{
p_line1 = p_line2;
@@ -889,25 +1082,28 @@ static void yv12_to_yuy2_mmxext
p_y1 = p_y2;
p_y2 += 2 * y_src_pitch;
+ if( i_y > 1 ) {
+ p_ub = p_u + 2 * u_src_pitch;
+ p_vb = p_v + 2 * v_src_pitch;
+ } else {
+ p_ub = p_u;
+ p_vb = p_v;
+ }
+
for( i_x = width / 8 ; i_x-- ; )
{
- MMXEXT_YUV420_YUYV( );
+ MMXEXT_YUV420_YUYV_INTERLACED_ODD( );
}
for( i_x = (width % 8) / 2 ; i_x-- ; )
{
- C_YUV420_YUYV( );
+ C_YUV420_YUYV_INTERLACED_ODD( );
}
p_y2 += i_source_margin + y_src_pitch;
p_u += i_source_u_margin + u_src_pitch;
p_v += i_source_v_margin + v_src_pitch;
- if( i_y > 1 ) {
- p_u2 += i_source_u_margin + u_src_pitch;
- p_v2 += i_source_v_margin + v_src_pitch;
- } else {
- p_u2 = p_u;
- p_v2 = p_v;
- }
+ p_ut = p_u - 2 * u_src_pitch;
+ p_vt = p_v - 2 * v_src_pitch;
p_line2 += i_dest_margin + yuy2_pitch;
}
@@ -915,9 +1111,9 @@ static void yv12_to_yuy2_mmxext
p_y2 = y_src + y_src_pitch;
p_u = u_src + u_src_pitch;
p_v = v_src + v_src_pitch;
- p_u2 = u_src + 3*u_src_pitch;
- p_v2 = v_src + 3*v_src_pitch;
-
+ p_ut = p_u;
+ p_vt = p_v;
+
for( i_y = height / 4 ; i_y-- ; )
{
p_line1 = p_line2;
@@ -926,25 +1122,28 @@ static void yv12_to_yuy2_mmxext
p_y1 = p_y2;
p_y2 += 2 * y_src_pitch;
+ if( i_y > 1 ) {
+ p_ub = p_u + 2 * u_src_pitch;
+ p_vb = p_v + 2 * v_src_pitch;
+ } else {
+ p_ub = p_u;
+ p_vb = p_v;
+ }
+
for( i_x = width / 8 ; i_x-- ; )
{
- MMXEXT_YUV420_YUYV( );
+ MMXEXT_YUV420_YUYV_INTERLACED_EVEN( );
}
for( i_x = (width % 8) / 2 ; i_x-- ; )
{
- C_YUV420_YUYV( );
+ C_YUV420_YUYV_INTERLACED_EVEN( );
}
p_y2 += i_source_margin + y_src_pitch;
p_u += i_source_u_margin + u_src_pitch;
p_v += i_source_v_margin + v_src_pitch;
- if( i_y > 1 ) {
- p_u2 += i_source_u_margin + u_src_pitch;
- p_v2 += i_source_v_margin + v_src_pitch;
- } else {
- p_u2 = p_u;
- p_v2 = p_v;
- }
+ p_ut = p_u - 2 * u_src_pitch;
+ p_vt = p_v - 2 * v_src_pitch;
p_line2 += i_dest_margin + yuy2_pitch;
}
@@ -1143,7 +1342,7 @@ void init_yuv_conversion(void) {
else
yv12_to_yuy2 = yv12_to_yuy2_c;
- /* determine best YV12 -> YUY2 converter to use */
+ /* determine best YUY2 -> YV12 converter to use */
if (xine_mm_accel() & MM_ACCEL_X86_MMXEXT)
yuy2_to_yv12 = yuy2_to_yv12_mmxext;
else