From 8e8d3892412c5b3166efbc9aa4571dcaf2cc2a57 Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Thu, 22 Dec 2011 01:40:22 +0100 Subject: Use proper chroma upsampling for yv12 to yuy2 conversion The old code did some "averaging" which, while cheap, led to serious chroma shift because the weighting factors turned out to be pretty random (arguably no averaging likely would have given more correct results). It also in fact led to chroma ghosts. To see why this was wrong read the following and then do the math. http://www.hometheaterhifi.com/the-dvd-benchmark/179-the-chroma-upsampling-error-and-the-420-interlaced-chroma-problem.html http://avisynth.org/mediawiki/Sampling As an example, let's look at what happens at line 4 for interlaced content (where the code would have averaged chroma from chroma line 2 and 4): Chroma line 2 contains chroma values for line 2 (25%) and 4 (75%) while chroma line 4 contains chroma values for line 6 (25%) and 8 (75%) of the original (prior to subsampling) frame. Average these together and you get something quite wrong. Most importantly the center of these weights will be at 5.5 instead of 4 (hence chroma shift). For odd lines it is different (better but still wrong). So, fix this by using the correct weights for reconstruction of the chroma values (which is averaging for the progressive case for all pixels since the samples are defined to be between the lines, and use different weighting factors for odd/even/"upper"/"lower" lines). This runs more than twice the instructions (for the mmx case), but I measured only a performance impact of roughly 5% (on an Athlon64 X2) - seriously bound by memory access (by comparison the sort-of-pointless post-deinterlace chroma filter is nearly twice as slow hence if you don't need it because the values are correct this will be a lot faster). 
Note: this is only correct for codecs which use the same chroma positions as mpeg2 (dv is definitely different, mpeg1 is also different but only for horizontal positioning, which doesn't matter here). "yv12" as such seems underspecified wrt chroma positioning. On another note, while this algorithm may be correct, it is inherently suboptimal doing this pre-deinterlace (and a post-deinterlace chroma filter is not going to help much either except it can blur the mess). This NEEDS to be part of deinterlace (which btw would also be quite a bit faster when handling planar directly due to saving one pass of going through all memory). The reason is that while line 4 will now use the correct weighting factors, the fact remains it will use chroma values originating from lines 2, 4, 6 and 8 of the original image. However, if the deinterlacer decides to weave because there is no motion, it CAN and most likely wants to use chroma values from the other field (hence values originating from line 2, 3, 4, 5 in this case when using a very simple filter, with appropriate weighting). 
--HG-- branch : point-release extra : rebase_source : 808bb5785ca398970324bea6b391a9e24c576d2f --- src/xine-utils/color.c | 377 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 288 insertions(+), 89 deletions(-) diff --git a/src/xine-utils/color.c b/src/xine-utils/color.c index bea4cd952..3c2388c27 100644 --- a/src/xine-utils/color.c +++ b/src/xine-utils/color.c @@ -658,16 +658,31 @@ static void yuv411_to_yv12_c } -#define C_YUV420_YUYV( ) \ - *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ - *p_line1++ = *p_u; *p_line2++ = (*p_u++ + *p_u2++)>>1; \ - *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ - *p_line1++ = *p_v; *p_line2++ = (*p_v++ + *p_v2++)>>1; +#define C_YUV420_YUYV_PROGRESSIVE( ) \ + utmp = 3 * *p_u++; \ + vtmp = 3 * *p_v++; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ + utmp) >> 2; *p_line2++ = (utmp + *p_ub++) >> 2; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ + vtmp) >> 2; *p_line2++ = (vtmp + *p_vb++) >> 2; \ + +#define C_YUV420_YUYV_INTERLACED_ODD( ) \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ + *p_u * 7) >> 3; *p_line2++ = (*p_u++ * 5 + *p_ub++ * 3) >> 3; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ + *p_v * 7) >> 3; *p_line2++ = (*p_v++ * 5 + *p_vb++ * 3) >> 3; \ + +#define C_YUV420_YUYV_INTERLACED_EVEN( ) \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ * 3 + *p_u * 5) >> 3; *p_line2++ = (*p_u++ * 7 + *p_ub++) >> 3; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ * 3 + *p_v * 5) >> 3; *p_line2++ = (*p_v++ * 7 + *p_vb++) >> 3; \ /***************************************************************************** * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 * original conversion routine from Videolan project - * changed to support interlaced frames and use simple mean interpolation [MF] + * changed to support interlaced frames and do correct chroma upsampling with + * 
correct weighting factors and no chroma shift. *****************************************************************************/ static void yv12_to_yuy2_c (const unsigned char *y_src, int y_src_pitch, @@ -680,10 +695,12 @@ static void yv12_to_yuy2_c const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; - const uint8_t *p_u2 = u_src + u_src_pitch; - const uint8_t *p_v2 = v_src + v_src_pitch; + const uint8_t *p_ub, *p_vb; + const uint8_t *p_ut = u_src; + const uint8_t *p_vt = v_src; int i_x, i_y; + int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; @@ -701,28 +718,29 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + u_src_pitch; + p_vb = p_v + v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_PROGRESSIVE( ); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; - if( i_y > 1 ) { - p_u2 += i_source_u_margin; - p_v2 += i_source_v_margin; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - u_src_pitch; + p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { - p_u2 = u_src + 2*u_src_pitch; - p_v2 = v_src + 2*v_src_pitch; for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -731,21 +749,24 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_ODD( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 
+= i_dest_margin + yuy2_pitch; } @@ -753,8 +774,8 @@ static void yv12_to_yuy2_c p_y2 = y_src + y_src_pitch; p_u = u_src + u_src_pitch; p_v = v_src + v_src_pitch; - p_u2 = u_src + 3*u_src_pitch; - p_v2 = v_src + 3*v_src_pitch; + p_ut = p_u; + p_vt = p_v; for( i_y = height / 4 ; i_y-- ; ) { @@ -764,21 +785,24 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_EVEN( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -788,38 +812,204 @@ static void yv12_to_yuy2_c #if defined(ARCH_X86) || defined(ARCH_X86_64) -#define MMXEXT_YUV420_YUYV( ) \ +#define MMXEXT_YUV420_YUYV_PROGRESSIVE( ) \ do { \ __asm__ __volatile__(".align 8 \n\t" \ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "movd (%1), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%2), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ - "punpcklbw %%mm2, %%mm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ - "movq %%mm0, %%mm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ - "punpcklbw %%mm1, %%mm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psllw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 3 */ \ 
+ "paddw %%mm4, %%mm2 \n\t" /* Cr * 3 */ \ : \ : "r" (p_y1), "r" (p_u), "r" (p_v) ); \ __asm__ __volatile__( \ - "movd (%0), %%mm3 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ - "movd (%1), %%mm4 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ - "punpcklbw %%mm4, %%mm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ - "pavgb %%mm1, %%mm3 \n\t" /* (mean) v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movd (%0), %%mm3 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm4 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movd (%2), %%mm5 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%3), %%mm6 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb1 = Cbt + 3*Cb */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr1 = Crt + 3*Cr */ \ + "paddw %%mm5, %%mm1 \n\t" /* Cb2 = Cbb + 3*Cb */ \ + "paddw %%mm6, %%mm2 \n\t" /* Cr2 = Crb + 3*Cr */ \ + "psrlw $2, %%mm3 \n\t" /* Cb1 = (Cbt + 3*Cb) / 4 */ \ + /* either the shifts by 2 and 8 or mask off bits and shift by 6 */ \ + "psrlw $2, %%mm4 \n\t" /* Cr1 = (Crt + 3*Cr) / 4 */ \ + "psllw $8, %%mm4 \n\t" \ + "por %%mm4, %%mm3 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "psrlw $2, %%mm1 \n\t" /* Cb2 = (Cbb + 3*Cb) / 4 */ \ + "psrlw $2, %%mm2 \n\t" /* Cr2 = (Cbb + 3*Cb) / 4 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm3, %%mm1 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ : \ - : "r" (p_u2), "r" (p_v2) ); \ + : "r" (p_ut), "r" (p_vt), "r" (p_ub), "r" (p_vb) ); \ __asm__ __volatile__( \ - "movntq %%mm2, (%0) \n\t" /* Store low YUYV */ \ - "punpckhbw %%mm1, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ - "movntq %%mm0, 8(%0) \n\t" 
/* Store high YUYV */ \ + "movntq %%mm1, (%0) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm3, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%0) \n\t" /* Store high YUYV1 */ \ "movq (%2), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ - "movq %%mm0, %%mm2 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ - "punpcklbw %%mm3, %%mm2 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ - "movntq %%mm2, (%1) \n\t" /* Store low YUYV */ \ - "punpckhbw %%mm3, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ - "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%1) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_line1), "r" (p_line2), "r" (p_y2) ); \ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ - p_u2 += 4; p_v2 += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ +} while(0) + +#define MMXEXT_YUV420_YUYV_INTERLACED_ODD( ) \ +do { \ + __asm__ __volatile__(".align 8 \n\t" \ + "movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \ + "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 */ \ + "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \ + : \ + : "r" (p_u), "r" (p_v) ); \ + __asm__ __volatile__( \ + "movd 
(%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb */ \ + "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \ + "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \ + "psllw $8, %%mm6 \n\t" \ + "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "movntq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \ + : \ + : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \ + "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \ + "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \ + "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \ + "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb 
+ 5*Cr) / 8 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \ + : \ + : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \ + p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ +} while(0) + +#define MMXEXT_YUV420_YUYV_INTERLACED_EVEN( ) \ +/* same as above, except the assembly input arguments are switched */ \ +do { \ + __asm__ __volatile__(".align 8 \n\t" \ + "movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \ + "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 */ \ + "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \ + : \ + : "r" (p_u), "r" (p_v) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb 
*/ \ + "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \ + "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \ + "psllw $8, %%mm6 \n\t" \ + "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "movntq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \ + : \ + : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \ + "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \ + "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \ + "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \ + "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb + 5*Cr) / 8 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 
8(%3) \n\t" /* Store high YUYV2 */ \ + : \ + : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \ + p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ } while(0) #endif @@ -835,10 +1025,12 @@ static void yv12_to_yuy2_mmxext const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; - const uint8_t *p_u2 = u_src + u_src_pitch; - const uint8_t *p_v2 = v_src + v_src_pitch; + const uint8_t *p_ub, *p_vb; + const uint8_t *p_ut = u_src; + const uint8_t *p_vt = v_src; int i_x, i_y; + int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; @@ -855,32 +1047,33 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + u_src_pitch; + p_vb = p_v + v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_PROGRESSIVE( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_PROGRESSIVE( ); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; - if( i_y > 1 ) { - p_u2 += i_source_u_margin; - p_v2 += i_source_v_margin; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - u_src_pitch; + p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { - p_u2 = u_src + 2*u_src_pitch; - p_v2 = v_src + 2*v_src_pitch; for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -889,25 +1082,28 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_INTERLACED_ODD( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_ODD( ); } p_y2 += i_source_margin + y_src_pitch; p_u += 
i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -915,9 +1111,9 @@ static void yv12_to_yuy2_mmxext p_y2 = y_src + y_src_pitch; p_u = u_src + u_src_pitch; p_v = v_src + v_src_pitch; - p_u2 = u_src + 3*u_src_pitch; - p_v2 = v_src + 3*v_src_pitch; - + p_ut = p_u; + p_vt = p_v; + for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -926,25 +1122,28 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_INTERLACED_EVEN( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_EVEN( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -1143,7 +1342,7 @@ void init_yuv_conversion(void) { else yv12_to_yuy2 = yv12_to_yuy2_c; - /* determine best YV12 -> YUY2 converter to use */ + /* determine best YUY2 -> YV12 converter to use */ if (xine_mm_accel() & MM_ACCEL_X86_MMXEXT) yuy2_to_yv12 = yuy2_to_yv12_mmxext; else -- cgit v1.2.3 From 761030ff507cea3b6bd9bca4a755811725fdde1a Mon Sep 17 00:00:00 2001 From: "\"Torsten Jager\"" Date: Tue, 27 Dec 2011 15:11:28 +0100 Subject: Fixes nasty mpeg2 on ts A/V lag when using ff. 
--HG-- branch : point-release extra : rebase_source : 6e059c732a63d40b65b09f4ef725ec5ca45c4c1c --- src/combined/ffmpeg/ff_video_decoder.c | 122 +++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/src/combined/ffmpeg/ff_video_decoder.c b/src/combined/ffmpeg/ff_video_decoder.c index a1d729df4..50357182a 100644 --- a/src/combined/ffmpeg/ff_video_decoder.c +++ b/src/combined/ffmpeg/ff_video_decoder.c @@ -96,6 +96,7 @@ struct ff_video_decoder_s { xine_stream_t *stream; int64_t pts; + int64_t last_pts; #ifdef AVCODEC_HAS_REORDERED_OPAQUE uint64_t pts_tag_mask; uint64_t pts_tag; @@ -433,6 +434,9 @@ static void init_video_codec (ff_video_decoder_t *this, unsigned int codec_type) break; } + /* dont want initial AV_NOPTS_VALUE here */ + this->context->reordered_opaque = 0; + } static void choose_speed_over_accuracy_cb(void *user_data, xine_cfg_entry_t *entry) { @@ -1081,6 +1085,54 @@ static void ff_handle_special_buffer (ff_video_decoder_t *this, buf_element_t *b } } +#ifdef AVCODEC_HAS_REORDERED_OPAQUE +static uint64_t ff_tag_pts(ff_video_decoder_t *this, uint64_t pts) +{ + return pts | this->pts_tag; +} + +static uint64_t ff_untag_pts(ff_video_decoder_t *this, uint64_t pts) +{ + if (this->pts_tag_mask == 0) + return pts; /* pts tagging inactive */ + + if (this->pts_tag != 0 && (pts & this->pts_tag_mask) != this->pts_tag) + return 0; /* reset pts if outdated while waiting for first pass (see below) */ + + return pts & ~this->pts_tag_mask; +} + +static void ff_check_pts_tagging(ff_video_decoder_t *this, uint64_t pts) +{ + if (this->pts_tag_mask == 0) + return; /* pts tagging inactive */ + if ((pts & this->pts_tag_mask) != this->pts_tag) { + this->pts_tag_stable_counter = 0; + return; /* pts still outdated */ + } + + /* the tag should be stable for 100 frames */ + this->pts_tag_stable_counter++; + + if (this->pts_tag != 0) { + if (this->pts_tag_stable_counter >= 100) { + /* first pass: reset pts_tag */ + this->pts_tag = 0; + 
this->pts_tag_stable_counter = 0; + } + } else if (pts == 0) + return; /* cannot detect second pass */ + else { + if (this->pts_tag_stable_counter >= 100) { + /* second pass: reset pts_tag_mask and pts_tag_counter */ + this->pts_tag_mask = 0; + this->pts_tag_counter = 0; + this->pts_tag_stable_counter = 0; + } + } +} +#endif /* AVCODEC_HAS_REORDERED_OPAQUE */ + static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *buf) { vo_frame_t *img; @@ -1102,6 +1154,15 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *bu uint8_t *current; int next_flush; +#ifdef AVCODEC_HAS_REORDERED_OPAQUE + /* apply valid pts to first frame _starting_ thereafter only */ + if (this->pts && !this->context->reordered_opaque) { + this->context->reordered_opaque = + this->av_frame->reordered_opaque = ff_tag_pts (this, this->pts); + this->pts = 0; + } +#endif /* AVCODEC_HAS_REORDERED_OPAQUE */ + got_picture = 0; if (!flush) { current = mpeg_parser_decode_data(this->mpeg_parser, @@ -1183,8 +1244,16 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *bu free_img = 0; } +#ifdef AVCODEC_HAS_REORDERED_OPAQUE + /* get back reordered pts */ + img->pts = ff_untag_pts (this, this->av_frame->reordered_opaque); + ff_check_pts_tagging (this, this->av_frame->reordered_opaque); + this->av_frame->reordered_opaque = 0; + this->context->reordered_opaque = 0; +#else img->pts = this->pts; this->pts = 0; +#endif /* AVCODEC_HAS_REORDERED_OPAQUE */ if (this->av_frame->repeat_pict) img->duration = this->video_step * 3 / 2; @@ -1225,54 +1294,6 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *bu } } -#ifdef AVCODEC_HAS_REORDERED_OPAQUE -static uint64_t ff_tag_pts(ff_video_decoder_t *this, uint64_t pts) -{ - return pts | this->pts_tag; -} - -static uint64_t ff_untag_pts(ff_video_decoder_t *this, uint64_t pts) -{ - if (this->pts_tag_mask == 0) - return pts; /* pts tagging inactive */ - - if (this->pts_tag != 0 
&& (pts & this->pts_tag_mask) != this->pts_tag) - return 0; /* reset pts if outdated while waiting for first pass (see below) */ - - return pts & ~this->pts_tag_mask; -} - -static void ff_check_pts_tagging(ff_video_decoder_t *this, uint64_t pts) -{ - if (this->pts_tag_mask == 0) - return; /* pts tagging inactive */ - if ((pts & this->pts_tag_mask) != this->pts_tag) { - this->pts_tag_stable_counter = 0; - return; /* pts still outdated */ - } - - /* the tag should be stable for 100 frames */ - this->pts_tag_stable_counter++; - - if (this->pts_tag != 0) { - if (this->pts_tag_stable_counter >= 100) { - /* first pass: reset pts_tag */ - this->pts_tag = 0; - this->pts_tag_stable_counter = 0; - } - } else if (pts == 0) - return; /* cannot detect second pass */ - else { - if (this->pts_tag_stable_counter >= 100) { - /* second pass: reset pts_tag_mask and pts_tag_counter */ - this->pts_tag_mask = 0; - this->pts_tag_counter = 0; - this->pts_tag_stable_counter = 0; - } - } -} -#endif /* AVCODEC_HAS_REORDERED_OPAQUE */ - static void ff_handle_buffer (ff_video_decoder_t *this, buf_element_t *buf) { uint8_t *chunk_buf = this->buf; AVRational avr00 = {0, 1}; @@ -1616,8 +1637,9 @@ static void ff_decode_data (video_decoder_t *this_gen, buf_element_t *buf) { } else { /* decode */ - if (buf->pts) - this->pts = buf->pts; + /* PES: each valid pts shall be used only once */ + if (buf->pts && (buf->pts != this->last_pts)) + this->last_pts = this->pts = buf->pts; if ((buf->type & 0xFFFF0000) == BUF_VIDEO_MPEG) { ff_handle_mpeg12_buffer(this, buf); -- cgit v1.2.3