From 8e8d3892412c5b3166efbc9aa4571dcaf2cc2a57 Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Thu, 22 Dec 2011 01:40:22 +0100 Subject: Use proper chroma upsampling for yv12 to yuy2 conversion The old code did some "averaging" which, while cheap, led to serious chroma shift because the weighting factors turned out to be pretty random (arguably no averaging likely would have given more correct results). It also in fact led to chroma ghosts. To see why this was wrong read the following and then do the math. http://www.hometheaterhifi.com/the-dvd-benchmark/179-the-chroma-upsampling-error-and-the-420-interlaced-chroma-problem.html http://avisynth.org/mediawiki/Sampling As an example, let's look at what happens at line 4 for interlaced content (where the code would have averaged chroma from chroma line 2 and 4): Chroma line 2 contains chroma values for line 2 (25%) and 4 (75%) while chroma line 4 contains chroma values for line 6 (25%) and 8 (75%) of the original (prior to subsampling) frame. Average these together and you get something quite wrong. Most importantly the center of these weights will be at 5.5 instead of 4 (hence chroma shift). For odd lines it is different (better but still wrong). So, fix this by using the correct weights for reconstruction of the chroma values (which is averaging for the progressive case for all pixels since the samples are defined to be between the lines, and use different weighting factors for odd/even/"upper"/"lower" lines). This runs more than twice the instructions (for the mmx case), but I measured only a performance impact of roughly 5% (on an Athlon64 X2) - seriously bound by memory access (by comparison the sort-of-pointless post-deinterlace chroma filter is nearly twice as slow hence if you don't need it because the values are correct this will be a lot faster). 
Note: this is only correct for codecs which use the same chroma positions as mpeg2 (dv is definitely different, mpeg1 is also different but only for horizontal positioning, which doesn't matter here). "yv12" as such seems underspecified wrt chroma positioning. On another note, while this algorithm may be correct, it is inherently suboptimal doing this pre-deinterlace (and a post-deinterlace chroma filter is not going to help much either except it can blur the mess). This NEEDS to be part of deinterlace (which btw would also be quite a bit faster when handling planar directly due to saving one pass of going through all memory). The reason is that while line 4 will now use the correct weighting factors, the fact remains it will use chroma values originating from lines 2, 4, 6 and 8 of the original image. However, if the deinterlacer decides to weave because there is no motion, it CAN and most likely wants to use chroma values from the other field (hence values originating from line 2, 3, 4, 5 in this case when using a very simple filter, with appropriate weighting). 
--HG-- branch : point-release extra : rebase_source : 808bb5785ca398970324bea6b391a9e24c576d2f --- src/xine-utils/color.c | 377 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 288 insertions(+), 89 deletions(-) diff --git a/src/xine-utils/color.c b/src/xine-utils/color.c index bea4cd952..3c2388c27 100644 --- a/src/xine-utils/color.c +++ b/src/xine-utils/color.c @@ -658,16 +658,31 @@ static void yuv411_to_yv12_c } -#define C_YUV420_YUYV( ) \ - *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ - *p_line1++ = *p_u; *p_line2++ = (*p_u++ + *p_u2++)>>1; \ - *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ - *p_line1++ = *p_v; *p_line2++ = (*p_v++ + *p_v2++)>>1; +#define C_YUV420_YUYV_PROGRESSIVE( ) \ + utmp = 3 * *p_u++; \ + vtmp = 3 * *p_v++; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ + utmp) >> 2; *p_line2++ = (utmp + *p_ub++) >> 2; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ + vtmp) >> 2; *p_line2++ = (vtmp + *p_vb++) >> 2; \ + +#define C_YUV420_YUYV_INTERLACED_ODD( ) \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ + *p_u * 7) >> 3; *p_line2++ = (*p_u++ * 5 + *p_ub++ * 3) >> 3; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ + *p_v * 7) >> 3; *p_line2++ = (*p_v++ * 5 + *p_vb++ * 3) >> 3; \ + +#define C_YUV420_YUYV_INTERLACED_EVEN( ) \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ * 3 + *p_u * 5) >> 3; *p_line2++ = (*p_u++ * 7 + *p_ub++) >> 3; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ * 3 + *p_v * 5) >> 3; *p_line2++ = (*p_v++ * 7 + *p_vb++) >> 3; \ /***************************************************************************** * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 * original conversion routine from Videolan project - * changed to support interlaced frames and use simple mean interpolation [MF] + * changed to support interlaced frames and do correct chroma upsampling with + * 
correct weighting factors and no chroma shift. *****************************************************************************/ static void yv12_to_yuy2_c (const unsigned char *y_src, int y_src_pitch, @@ -680,10 +695,12 @@ static void yv12_to_yuy2_c const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; - const uint8_t *p_u2 = u_src + u_src_pitch; - const uint8_t *p_v2 = v_src + v_src_pitch; + const uint8_t *p_ub, *p_vb; + const uint8_t *p_ut = u_src; + const uint8_t *p_vt = v_src; int i_x, i_y; + int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; @@ -701,28 +718,29 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + u_src_pitch; + p_vb = p_v + v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_PROGRESSIVE( ); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; - if( i_y > 1 ) { - p_u2 += i_source_u_margin; - p_v2 += i_source_v_margin; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - u_src_pitch; + p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { - p_u2 = u_src + 2*u_src_pitch; - p_v2 = v_src + 2*v_src_pitch; for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -731,21 +749,24 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_ODD( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 
+= i_dest_margin + yuy2_pitch; } @@ -753,8 +774,8 @@ static void yv12_to_yuy2_c p_y2 = y_src + y_src_pitch; p_u = u_src + u_src_pitch; p_v = v_src + v_src_pitch; - p_u2 = u_src + 3*u_src_pitch; - p_v2 = v_src + 3*v_src_pitch; + p_ut = p_u; + p_vt = p_v; for( i_y = height / 4 ; i_y-- ; ) { @@ -764,21 +785,24 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_EVEN( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -788,38 +812,204 @@ static void yv12_to_yuy2_c #if defined(ARCH_X86) || defined(ARCH_X86_64) -#define MMXEXT_YUV420_YUYV( ) \ +#define MMXEXT_YUV420_YUYV_PROGRESSIVE( ) \ do { \ __asm__ __volatile__(".align 8 \n\t" \ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "movd (%1), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%2), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ - "punpcklbw %%mm2, %%mm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ - "movq %%mm0, %%mm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ - "punpcklbw %%mm1, %%mm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psllw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 3 */ \ 
+ "paddw %%mm4, %%mm2 \n\t" /* Cr * 3 */ \ : \ : "r" (p_y1), "r" (p_u), "r" (p_v) ); \ __asm__ __volatile__( \ - "movd (%0), %%mm3 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ - "movd (%1), %%mm4 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ - "punpcklbw %%mm4, %%mm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ - "pavgb %%mm1, %%mm3 \n\t" /* (mean) v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movd (%0), %%mm3 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm4 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movd (%2), %%mm5 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%3), %%mm6 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb1 = Cbt + 3*Cb */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr1 = Crt + 3*Cr */ \ + "paddw %%mm5, %%mm1 \n\t" /* Cb2 = Cbb + 3*Cb */ \ + "paddw %%mm6, %%mm2 \n\t" /* Cr2 = Crb + 3*Cr */ \ + "psrlw $2, %%mm3 \n\t" /* Cb1 = (Cbt + 3*Cb) / 4 */ \ + /* either the shifts by 2 and 8 or mask off bits and shift by 6 */ \ + "psrlw $2, %%mm4 \n\t" /* Cr1 = (Crt + 3*Cr) / 4 */ \ + "psllw $8, %%mm4 \n\t" \ + "por %%mm4, %%mm3 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "psrlw $2, %%mm1 \n\t" /* Cb2 = (Cbb + 3*Cb) / 4 */ \ + "psrlw $2, %%mm2 \n\t" /* Cr2 = (Cbb + 3*Cb) / 4 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm3, %%mm1 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ : \ - : "r" (p_u2), "r" (p_v2) ); \ + : "r" (p_ut), "r" (p_vt), "r" (p_ub), "r" (p_vb) ); \ __asm__ __volatile__( \ - "movntq %%mm2, (%0) \n\t" /* Store low YUYV */ \ - "punpckhbw %%mm1, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ - "movntq %%mm0, 8(%0) \n\t" 
/* Store high YUYV */ \ + "movntq %%mm1, (%0) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm3, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%0) \n\t" /* Store high YUYV1 */ \ "movq (%2), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ - "movq %%mm0, %%mm2 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ - "punpcklbw %%mm3, %%mm2 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ - "movntq %%mm2, (%1) \n\t" /* Store low YUYV */ \ - "punpckhbw %%mm3, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ - "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%1) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_line1), "r" (p_line2), "r" (p_y2) ); \ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ - p_u2 += 4; p_v2 += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ +} while(0) + +#define MMXEXT_YUV420_YUYV_INTERLACED_ODD( ) \ +do { \ + __asm__ __volatile__(".align 8 \n\t" \ + "movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \ + "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 */ \ + "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \ + : \ + : "r" (p_u), "r" (p_v) ); \ + __asm__ __volatile__( \ + "movd 
(%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb */ \ + "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \ + "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \ + "psllw $8, %%mm6 \n\t" \ + "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "movntq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \ + : \ + : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \ + "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \ + "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \ + "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \ + "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb 
+ 5*Cr) / 8 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \ + : \ + : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \ + p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ +} while(0) + +#define MMXEXT_YUV420_YUYV_INTERLACED_EVEN( ) \ +/* same as above, except the assembly input arguments are switched */ \ +do { \ + __asm__ __volatile__(".align 8 \n\t" \ + "movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \ + "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 */ \ + "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \ + : \ + : "r" (p_u), "r" (p_v) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb 
*/ \ + "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \ + "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \ + "psllw $8, %%mm6 \n\t" \ + "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "movntq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \ + : \ + : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \ + "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \ + "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \ + "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \ + "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb + 5*Cr) / 8 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 
8(%3) \n\t" /* Store high YUYV2 */ \ + : \ + : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \ + p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ } while(0) #endif @@ -835,10 +1025,12 @@ static void yv12_to_yuy2_mmxext const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; - const uint8_t *p_u2 = u_src + u_src_pitch; - const uint8_t *p_v2 = v_src + v_src_pitch; + const uint8_t *p_ub, *p_vb; + const uint8_t *p_ut = u_src; + const uint8_t *p_vt = v_src; int i_x, i_y; + int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; @@ -855,32 +1047,33 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + u_src_pitch; + p_vb = p_v + v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_PROGRESSIVE( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_PROGRESSIVE( ); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; - if( i_y > 1 ) { - p_u2 += i_source_u_margin; - p_v2 += i_source_v_margin; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - u_src_pitch; + p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { - p_u2 = u_src + 2*u_src_pitch; - p_v2 = v_src + 2*v_src_pitch; for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -889,25 +1082,28 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_INTERLACED_ODD( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_ODD( ); } p_y2 += i_source_margin + y_src_pitch; p_u += 
i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -915,9 +1111,9 @@ static void yv12_to_yuy2_mmxext p_y2 = y_src + y_src_pitch; p_u = u_src + u_src_pitch; p_v = v_src + v_src_pitch; - p_u2 = u_src + 3*u_src_pitch; - p_v2 = v_src + 3*v_src_pitch; - + p_ut = p_u; + p_vt = p_v; + for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -926,25 +1122,28 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_INTERLACED_EVEN( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_EVEN( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -1143,7 +1342,7 @@ void init_yuv_conversion(void) { else yv12_to_yuy2 = yv12_to_yuy2_c; - /* determine best YV12 -> YUY2 converter to use */ + /* determine best YUY2 -> YV12 converter to use */ if (xine_mm_accel() & MM_ACCEL_X86_MMXEXT) yuy2_to_yv12 = yuy2_to_yv12_mmxext; else -- cgit v1.2.3 From 761030ff507cea3b6bd9bca4a755811725fdde1a Mon Sep 17 00:00:00 2001 From: "\"Torsten Jager\"" Date: Tue, 27 Dec 2011 15:11:28 +0100 Subject: Fixes nasty mpeg2 on ts A/V lag when using ff. 
--HG-- branch : point-release extra : rebase_source : 6e059c732a63d40b65b09f4ef725ec5ca45c4c1c --- src/combined/ffmpeg/ff_video_decoder.c | 122 +++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/src/combined/ffmpeg/ff_video_decoder.c b/src/combined/ffmpeg/ff_video_decoder.c index a1d729df4..50357182a 100644 --- a/src/combined/ffmpeg/ff_video_decoder.c +++ b/src/combined/ffmpeg/ff_video_decoder.c @@ -96,6 +96,7 @@ struct ff_video_decoder_s { xine_stream_t *stream; int64_t pts; + int64_t last_pts; #ifdef AVCODEC_HAS_REORDERED_OPAQUE uint64_t pts_tag_mask; uint64_t pts_tag; @@ -433,6 +434,9 @@ static void init_video_codec (ff_video_decoder_t *this, unsigned int codec_type) break; } + /* dont want initial AV_NOPTS_VALUE here */ + this->context->reordered_opaque = 0; + } static void choose_speed_over_accuracy_cb(void *user_data, xine_cfg_entry_t *entry) { @@ -1081,6 +1085,54 @@ static void ff_handle_special_buffer (ff_video_decoder_t *this, buf_element_t *b } } +#ifdef AVCODEC_HAS_REORDERED_OPAQUE +static uint64_t ff_tag_pts(ff_video_decoder_t *this, uint64_t pts) +{ + return pts | this->pts_tag; +} + +static uint64_t ff_untag_pts(ff_video_decoder_t *this, uint64_t pts) +{ + if (this->pts_tag_mask == 0) + return pts; /* pts tagging inactive */ + + if (this->pts_tag != 0 && (pts & this->pts_tag_mask) != this->pts_tag) + return 0; /* reset pts if outdated while waiting for first pass (see below) */ + + return pts & ~this->pts_tag_mask; +} + +static void ff_check_pts_tagging(ff_video_decoder_t *this, uint64_t pts) +{ + if (this->pts_tag_mask == 0) + return; /* pts tagging inactive */ + if ((pts & this->pts_tag_mask) != this->pts_tag) { + this->pts_tag_stable_counter = 0; + return; /* pts still outdated */ + } + + /* the tag should be stable for 100 frames */ + this->pts_tag_stable_counter++; + + if (this->pts_tag != 0) { + if (this->pts_tag_stable_counter >= 100) { + /* first pass: reset pts_tag */ + this->pts_tag = 0; + 
this->pts_tag_stable_counter = 0; + } + } else if (pts == 0) + return; /* cannot detect second pass */ + else { + if (this->pts_tag_stable_counter >= 100) { + /* second pass: reset pts_tag_mask and pts_tag_counter */ + this->pts_tag_mask = 0; + this->pts_tag_counter = 0; + this->pts_tag_stable_counter = 0; + } + } +} +#endif /* AVCODEC_HAS_REORDERED_OPAQUE */ + static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *buf) { vo_frame_t *img; @@ -1102,6 +1154,15 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *bu uint8_t *current; int next_flush; +#ifdef AVCODEC_HAS_REORDERED_OPAQUE + /* apply valid pts to first frame _starting_ thereafter only */ + if (this->pts && !this->context->reordered_opaque) { + this->context->reordered_opaque = + this->av_frame->reordered_opaque = ff_tag_pts (this, this->pts); + this->pts = 0; + } +#endif /* AVCODEC_HAS_REORDERED_OPAQUE */ + got_picture = 0; if (!flush) { current = mpeg_parser_decode_data(this->mpeg_parser, @@ -1183,8 +1244,16 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *bu free_img = 0; } +#ifdef AVCODEC_HAS_REORDERED_OPAQUE + /* get back reordered pts */ + img->pts = ff_untag_pts (this, this->av_frame->reordered_opaque); + ff_check_pts_tagging (this, this->av_frame->reordered_opaque); + this->av_frame->reordered_opaque = 0; + this->context->reordered_opaque = 0; +#else img->pts = this->pts; this->pts = 0; +#endif /* AVCODEC_HAS_REORDERED_OPAQUE */ if (this->av_frame->repeat_pict) img->duration = this->video_step * 3 / 2; @@ -1225,54 +1294,6 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *bu } } -#ifdef AVCODEC_HAS_REORDERED_OPAQUE -static uint64_t ff_tag_pts(ff_video_decoder_t *this, uint64_t pts) -{ - return pts | this->pts_tag; -} - -static uint64_t ff_untag_pts(ff_video_decoder_t *this, uint64_t pts) -{ - if (this->pts_tag_mask == 0) - return pts; /* pts tagging inactive */ - - if (this->pts_tag != 0 
&& (pts & this->pts_tag_mask) != this->pts_tag) - return 0; /* reset pts if outdated while waiting for first pass (see below) */ - - return pts & ~this->pts_tag_mask; -} - -static void ff_check_pts_tagging(ff_video_decoder_t *this, uint64_t pts) -{ - if (this->pts_tag_mask == 0) - return; /* pts tagging inactive */ - if ((pts & this->pts_tag_mask) != this->pts_tag) { - this->pts_tag_stable_counter = 0; - return; /* pts still outdated */ - } - - /* the tag should be stable for 100 frames */ - this->pts_tag_stable_counter++; - - if (this->pts_tag != 0) { - if (this->pts_tag_stable_counter >= 100) { - /* first pass: reset pts_tag */ - this->pts_tag = 0; - this->pts_tag_stable_counter = 0; - } - } else if (pts == 0) - return; /* cannot detect second pass */ - else { - if (this->pts_tag_stable_counter >= 100) { - /* second pass: reset pts_tag_mask and pts_tag_counter */ - this->pts_tag_mask = 0; - this->pts_tag_counter = 0; - this->pts_tag_stable_counter = 0; - } - } -} -#endif /* AVCODEC_HAS_REORDERED_OPAQUE */ - static void ff_handle_buffer (ff_video_decoder_t *this, buf_element_t *buf) { uint8_t *chunk_buf = this->buf; AVRational avr00 = {0, 1}; @@ -1616,8 +1637,9 @@ static void ff_decode_data (video_decoder_t *this_gen, buf_element_t *buf) { } else { /* decode */ - if (buf->pts) - this->pts = buf->pts; + /* PES: each valid pts shall be used only once */ + if (buf->pts && (buf->pts != this->last_pts)) + this->last_pts = this->pts = buf->pts; if ((buf->type & 0xFFFF0000) == BUF_VIDEO_MPEG) { ff_handle_mpeg12_buffer(this, buf); -- cgit v1.2.3