author     Roland Scheidegger <rscheidegger_lists@hispeed.ch>   2011-12-22 01:40:22 +0100
committer  Roland Scheidegger <rscheidegger_lists@hispeed.ch>   2011-12-22 01:40:22 +0100
commit     8e8d3892412c5b3166efbc9aa4571dcaf2cc2a57 (patch)
tree       bc4fc35b8b80a7f6c9aeaafaa1bab112b259ce9e /src
parent     8304f7562d4023c2a5c8d7ca767c5676273dcbec (diff)
download   xine-lib-8e8d3892412c5b3166efbc9aa4571dcaf2cc2a57.tar.gz
           xine-lib-8e8d3892412c5b3166efbc9aa4571dcaf2cc2a57.tar.bz2
Use proper chroma upsampling for yv12 to yuy2 conversion
The old code did some "averaging" which, while cheap, led to serious
chroma shift because the weighting factors turned out to be pretty random
(arguably, doing no averaging at all would likely have given more correct
results). It also, in fact, led to chroma ghosts.
To see why this was wrong, read the following and then do the math:
http://www.hometheaterhifi.com/the-dvd-benchmark/179-the-chroma-upsampling-error-and-the-420-interlaced-chroma-problem.html
http://avisynth.org/mediawiki/Sampling
As an example, let's look at what happens at line 4 for interlaced content
(where the code would have averaged chroma from chroma lines 2 and 4):
Chroma line 2 contains chroma values for lines 2 (25%) and 4 (75%), while
chroma line 4 contains chroma values for lines 6 (25%) and 8 (75%) of the
original (prior to subsampling) frame.
Average these together and you get something quite wrong. Most importantly,
the center of these weights ends up at 5.5 instead of 4 (hence the chroma shift).
For odd lines it is different (better, but still wrong).
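The arithmetic is easy to check. Here is a tiny, self-contained check (not
part of the patch; the sample centers are taken from the example above, in
luma-line units): a plain average of the two chroma rows is centered at 5.5,
while a 7/8 : 1/8 mix - the kind of weighting the new interlaced macros use -
lands exactly on line 4.

    #include <stdio.h>

    /* Effective centers of the two chroma rows from the example above:
     * chroma line 2: 0.25*2 + 0.75*4 = 3.5
     * chroma line 4: 0.25*6 + 0.75*8 = 7.5
     * combine() returns where a weighted mix of the two rows is centered. */
    static double combine(double w0, double c0, double w1, double c1)
    {
        return w0 * c0 + w1 * c1;
    }

    int main(void)
    {
        printf("plain average: centered at %.2f\n", combine(0.500, 3.5, 0.500, 7.5)); /* 5.50 */
        printf("7/8 : 1/8 mix: centered at %.2f\n", combine(0.875, 3.5, 0.125, 7.5)); /* 4.00 */
        return 0;
    }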
So, fix this by using the correct weights for reconstructing the chroma
values: in the progressive case the same interpolation applies to all pixels,
since the samples are defined to lie between the lines, while the interlaced
case needs different weighting factors for odd/even ("upper"/"lower") lines.
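For reference, here is a scalar sketch of the weighting factors this change
uses; it mirrors the new C_YUV420_YUYV_* macros in the diff below, but the
helper names and the top/cur/bot naming are made up for illustration ("cur"
is the chroma row belonging to the current output line pair, "top"/"bot" are
the chroma rows above and below it).

    /* progressive: samples sit between the lines, same 1:3 / 3:1 mix for every pair */
    static unsigned char chroma_prog_upper(unsigned top, unsigned cur) { return (top + 3 * cur) >> 2; }
    static unsigned char chroma_prog_lower(unsigned cur, unsigned bot) { return (3 * cur + bot) >> 2; }

    /* interlaced, "odd" line pairs */
    static unsigned char chroma_int_odd_upper(unsigned top, unsigned cur) { return (top + 7 * cur) >> 3; }
    static unsigned char chroma_int_odd_lower(unsigned cur, unsigned bot) { return (5 * cur + 3 * bot) >> 3; }

    /* interlaced, "even" line pairs */
    static unsigned char chroma_int_even_upper(unsigned top, unsigned cur) { return (3 * top + 5 * cur) >> 3; }
    static unsigned char chroma_int_even_lower(unsigned cur, unsigned bot) { return (7 * cur + bot) >> 3; }

The mmx versions compute the same sums in 16-bit lanes and shift, so they
produce the same (truncated) results.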
This executes more than twice as many instructions (in the mmx case), but I
measured a performance impact of only roughly 5% (on an Athlon64 X2) - it is
seriously bound by memory access. (By comparison, the sort-of-pointless
post-deinterlace chroma filter is nearly twice as slow, so if you no longer
need it because the values are already correct, this will be a lot faster
overall.)
Note: this is only correct for codecs that use the same chroma positions
as mpeg2 (dv is definitely different; mpeg1 is also different, but only in
the horizontal positioning, which doesn't matter here). "yv12" as such seems
underspecified with respect to chroma positioning.
On another note, while this algorithm may be correct, doing this
pre-deinterlace is inherently suboptimal (and a post-deinterlace chroma
filter is not going to help much either, except that it can blur the mess).
This NEEDS to be part of the deinterlacer (which, btw, would also be quite a
bit faster when handling planar data directly, since it saves one pass
through all the memory).
The reason is that while line 4 will now use the correct weighting factors,
it will still use chroma values originating from lines 2, 4, 6 and 8 of the
original image. However, if the deinterlacer decides to weave because there
is no motion, it CAN and most likely wants to use chroma values from the
other field as well (hence values originating from lines 2, 3, 4 and 5 in
this case, with appropriate weighting, when using a very simple filter).
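To illustrate that last point, a weaving deinterlacer could derive shift-free
weights directly from the vertical positions of whatever chroma samples it
has at hand. The helper below is purely illustrative (not part of this patch):

    /* Given the vertical centers c0 and c1 (in luma-line units) of the two
     * nearest available chroma samples, compute the two-tap weights that
     * reconstruct chroma at "target" with no vertical shift. */
    static void chroma_weights(double target, double c0, double c1,
                               double *w0, double *w1)
    {
        *w0 = (c1 - target) / (c1 - c0); /* weight for the sample centered at c0 */
        *w1 = 1.0 - *w0;                 /* weight for the sample centered at c1 */
    }

With a single field, chroma_weights(4.0, 3.5, 7.5, ...) gives the 7/8 : 1/8
mix from the example above; once both fields are woven together, the nearest
samples sit much closer to the target line, so the same formula gives a
sharper, still shift-free result.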
--HG--
branch : point-release
extra : rebase_source : 808bb5785ca398970324bea6b391a9e24c576d2f
Diffstat (limited to 'src')
-rw-r--r-- | src/xine-utils/color.c | 377
1 file changed, 288 insertions, 89 deletions
diff --git a/src/xine-utils/color.c b/src/xine-utils/color.c index bea4cd952..3c2388c27 100644 --- a/src/xine-utils/color.c +++ b/src/xine-utils/color.c @@ -658,16 +658,31 @@ static void yuv411_to_yv12_c } -#define C_YUV420_YUYV( ) \ - *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ - *p_line1++ = *p_u; *p_line2++ = (*p_u++ + *p_u2++)>>1; \ - *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ - *p_line1++ = *p_v; *p_line2++ = (*p_v++ + *p_v2++)>>1; +#define C_YUV420_YUYV_PROGRESSIVE( ) \ + utmp = 3 * *p_u++; \ + vtmp = 3 * *p_v++; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ + utmp) >> 2; *p_line2++ = (utmp + *p_ub++) >> 2; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ + vtmp) >> 2; *p_line2++ = (vtmp + *p_vb++) >> 2; \ + +#define C_YUV420_YUYV_INTERLACED_ODD( ) \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ + *p_u * 7) >> 3; *p_line2++ = (*p_u++ * 5 + *p_ub++ * 3) >> 3; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ + *p_v * 7) >> 3; *p_line2++ = (*p_v++ * 5 + *p_vb++ * 3) >> 3; \ + +#define C_YUV420_YUYV_INTERLACED_EVEN( ) \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_ut++ * 3 + *p_u * 5) >> 3; *p_line2++ = (*p_u++ * 7 + *p_ub++) >> 3; \ + *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \ + *p_line1++ = (*p_vt++ * 3 + *p_v * 5) >> 3; *p_line2++ = (*p_v++ * 7 + *p_vb++) >> 3; \ /***************************************************************************** * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 * original conversion routine from Videolan project - * changed to support interlaced frames and use simple mean interpolation [MF] + * changed to support interlaced frames and do correct chroma upsampling with + * correct weighting factors and no chroma shift. 
*****************************************************************************/ static void yv12_to_yuy2_c (const unsigned char *y_src, int y_src_pitch, @@ -680,10 +695,12 @@ static void yv12_to_yuy2_c const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; - const uint8_t *p_u2 = u_src + u_src_pitch; - const uint8_t *p_v2 = v_src + v_src_pitch; + const uint8_t *p_ub, *p_vb; + const uint8_t *p_ut = u_src; + const uint8_t *p_vt = v_src; int i_x, i_y; + int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; @@ -701,28 +718,29 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + u_src_pitch; + p_vb = p_v + v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_PROGRESSIVE( ); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; - if( i_y > 1 ) { - p_u2 += i_source_u_margin; - p_v2 += i_source_v_margin; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - u_src_pitch; + p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { - p_u2 = u_src + 2*u_src_pitch; - p_v2 = v_src + 2*v_src_pitch; for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -731,21 +749,24 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_ODD( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -753,8 +774,8 @@ static void yv12_to_yuy2_c p_y2 = y_src + y_src_pitch; p_u = u_src + u_src_pitch; p_v = v_src + v_src_pitch; - p_u2 = u_src + 3*u_src_pitch; - p_v2 = v_src + 3*v_src_pitch; + p_ut = p_u; + p_vt = p_v; for( i_y = height / 4 ; i_y-- ; ) { @@ -764,21 +785,24 @@ static void yv12_to_yuy2_c p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_EVEN( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -788,38 +812,204 @@ static void yv12_to_yuy2_c #if defined(ARCH_X86) || defined(ARCH_X86_64) -#define MMXEXT_YUV420_YUYV( ) \ +#define MMXEXT_YUV420_YUYV_PROGRESSIVE( ) \ do { \ __asm__ __volatile__(".align 8 \n\t" \ "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ "movd (%1), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ "movd (%2), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ - "punpcklbw %%mm2, %%mm1 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ - "movq %%mm0, %%mm2 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ - "punpcklbw %%mm1, %%mm2 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 
00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psllw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 3 */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr * 3 */ \ : \ : "r" (p_y1), "r" (p_u), "r" (p_v) ); \ __asm__ __volatile__( \ - "movd (%0), %%mm3 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ - "movd (%1), %%mm4 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ - "punpcklbw %%mm4, %%mm3 \n\t" /* v3 u3 v2 u2 v1 u1 v0 u0 */ \ - "pavgb %%mm1, %%mm3 \n\t" /* (mean) v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movd (%0), %%mm3 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm4 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movd (%2), %%mm5 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%3), %%mm6 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb1 = Cbt + 3*Cb */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr1 = Crt + 3*Cr */ \ + "paddw %%mm5, %%mm1 \n\t" /* Cb2 = Cbb + 3*Cb */ \ + "paddw %%mm6, %%mm2 \n\t" /* Cr2 = Crb + 3*Cr */ \ + "psrlw $2, %%mm3 \n\t" /* Cb1 = (Cbt + 3*Cb) / 4 */ \ + /* either the shifts by 2 and 8 or mask off bits and shift by 6 */ \ + "psrlw $2, %%mm4 \n\t" /* Cr1 = (Crt + 3*Cr) / 4 */ \ + "psllw $8, %%mm4 \n\t" \ + "por %%mm4, %%mm3 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "psrlw $2, %%mm1 \n\t" /* Cb2 = (Cbb + 3*Cb) / 4 */ \ + "psrlw $2, %%mm2 \n\t" /* Cr2 = (Cbb + 3*Cb) / 4 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm3, %%mm1 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ : \ - : "r" (p_u2), "r" (p_v2) ); \ + : "r" (p_ut), "r" (p_vt), "r" (p_ub), "r" (p_vb) ); \ __asm__ __volatile__( \ - "movntq %%mm2, (%0) \n\t" /* Store low YUYV */ \ - "punpckhbw %%mm1, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ - "movntq %%mm0, 8(%0) \n\t" /* Store high YUYV */ \ + "movntq %%mm1, (%0) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm3, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%0) \n\t" /* Store high YUYV1 */ \ "movq (%2), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ - "movq %%mm0, %%mm2 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ - "punpcklbw %%mm3, %%mm2 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ - "movntq %%mm2, (%1) \n\t" /* Store low YUYV */ \ - "punpckhbw %%mm3, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ - "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%1) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 8(%1) \n\t" /* Store high YUYV2 */ \ : \ : "r" (p_line1), "r" (p_line2), "r" (p_y2) ); \ p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ - p_u2 += 4; p_v2 += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ +} while(0) + +#define MMXEXT_YUV420_YUYV_INTERLACED_ODD( ) \ +do { \ + __asm__ __volatile__(".align 8 \n\t" \ + 
"movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \ + "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 */ \ + "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \ + : \ + : "r" (p_u), "r" (p_v) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb */ \ + "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \ + "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \ + "psllw $8, %%mm6 \n\t" \ + "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "movntq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \ + : \ + : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \ + "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \ + "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \ + "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \ + "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb + 5*Cr) / 8 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \ + : \ + : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \ + p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ +} while(0) + +#define MMXEXT_YUV420_YUYV_INTERLACED_EVEN( ) \ +/* same as above, except the assembly input 
arguments are switched */ \ +do { \ + __asm__ __volatile__(".align 8 \n\t" \ + "movd (%0), %%mm1 \n\t" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%1), %%mm2 \n\t" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ + "pxor %%mm7, %%mm7 \n\t" /* 00 00 00 00 00 00 00 00 */ \ + "punpcklbw %%mm7, %%mm1 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm2 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm1, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm2, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $2, %%mm3 \n\t" /* Cb * 4 */ \ + "psllw $2, %%mm4 \n\t" /* Cr * 4 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb * 5 */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr * 5 */ \ + "psrlw $1, %%mm3 \n\t" /* Cb * 2 */ \ + "psrlw $1, %%mm4 \n\t" /* Cr * 2 */ \ + "paddw %%mm1, %%mm3 \n\t" /* Cb * 7 */ \ + "paddw %%mm2, %%mm4 \n\t" /* Cr * 7 */ \ + : \ + : "r" (p_u), "r" (p_v) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm5 \n\t" /* Load 4 Cbt 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm6 \n\t" /* Load 4 Crt 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm7, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "paddw %%mm3, %%mm5 \n\t" /* Cb1 = Cbt + 7*Cb */ \ + "paddw %%mm4, %%mm6 \n\t" /* Cr1 = Crt + 7*Cr */ \ + "psrlw $3, %%mm5 \n\t" /* Cb1 = (Cbt + 7*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm6 \n\t" /* Cr1 = (Crt + 7*Cr) / 8 */ \ + "psllw $8, %%mm6 \n\t" \ + "por %%mm5, %%mm6 \n\t" /* Cr1 Cb1 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm3 \n\t" /* y7 y6 y5 y4 y3 y2 y1 y0 */ \ + "punpcklbw %%mm6, %%mm3 \n\t" /* v1 y3 u1 y2 v0 y1 u0 y0 */ \ + "movntq %%mm3, (%3) \n\t" /* Store low YUYV1 */ \ + "punpckhbw %%mm6, %%mm0 \n\t" /* v3 y7 u3 y6 v2 y5 u2 y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV1 */ \ + : \ + : "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \ + __asm__ __volatile__( \ + "movd (%1), %%mm3 \n\t" /* Load 4 Cbb 00 00 00 00 u3 u2 u1 u0 */ \ + "movd (%2), %%mm4 \n\t" /* Load 4 Crb 00 00 00 00 v3 v2 v1 v0 */ \ + "movq (%0), %%mm0 \n\t" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm7, %%mm3 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm7, %%mm4 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm3, %%mm5 \n\t" /* 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm4, %%mm6 \n\t" /* 00 v3 00 v2 00 v1 00 v0 */ \ + "psllw $1, %%mm5 \n\t" /* Cbb * 2 */ \ + "psllw $1, %%mm6 \n\t" /* Crb * 2 */ \ + "paddw %%mm5, %%mm3 \n\t" /* Cbb * 3 */ \ + "paddw %%mm6, %%mm4 \n\t" /* Crb * 3 */ \ + "paddw %%mm3, %%mm1 \n\t" /* Cb2 = 3*Cbb + 5*Cb */ \ + "paddw %%mm4, %%mm2 \n\t" /* Cr2 = 3*Crb + 5*Cr */ \ + "psrlw $3, %%mm1 \n\t" /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \ + /* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \ + "psrlw $3, %%mm2 \n\t" /* Cr2 = (3*Crb + 5*Cr) / 8 */ \ + "psllw $8, %%mm2 \n\t" \ + "por %%mm1, %%mm2 \n\t" /* Cr2 Cb2 interl v3 u3 v2 u2 v1 u1 v0 u0 */ \ + "movq %%mm0, %%mm1 \n\t" /* Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \ + "punpcklbw %%mm2, %%mm1 \n\t" /* v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \ + "movntq %%mm1, (%3) \n\t" /* Store low YUYV2 */ \ + "punpckhbw %%mm2, %%mm0 \n\t" /* v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \ + "movntq %%mm0, 8(%3) \n\t" /* Store high YUYV2 */ \ + : \ + : "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \ + p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ + p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \ } while(0) #endif @@ -835,10 
+1025,12 @@ static void yv12_to_yuy2_mmxext const uint8_t *p_y1, *p_y2 = y_src; const uint8_t *p_u = u_src; const uint8_t *p_v = v_src; - const uint8_t *p_u2 = u_src + u_src_pitch; - const uint8_t *p_v2 = v_src + v_src_pitch; + const uint8_t *p_ub, *p_vb; + const uint8_t *p_ut = u_src; + const uint8_t *p_vt = v_src; int i_x, i_y; + int utmp, vtmp; const int i_source_margin = y_src_pitch - width; const int i_source_u_margin = u_src_pitch - width/2; @@ -855,32 +1047,33 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + u_src_pitch; + p_vb = p_v + v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_PROGRESSIVE( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_PROGRESSIVE( ); } p_y2 += i_source_margin; p_u += i_source_u_margin; p_v += i_source_v_margin; - if( i_y > 1 ) { - p_u2 += i_source_u_margin; - p_v2 += i_source_v_margin; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - u_src_pitch; + p_vt = p_v - v_src_pitch; p_line2 += i_dest_margin; } } else { - p_u2 = u_src + 2*u_src_pitch; - p_v2 = v_src + 2*v_src_pitch; for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -889,25 +1082,28 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_INTERLACED_ODD( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_ODD( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -915,9 +1111,9 @@ static void yv12_to_yuy2_mmxext p_y2 = y_src + y_src_pitch; p_u = u_src + u_src_pitch; p_v = v_src + v_src_pitch; - p_u2 = u_src + 3*u_src_pitch; - p_v2 = v_src + 3*v_src_pitch; - + p_ut = p_u; + p_vt = p_v; + for( i_y = height / 4 ; i_y-- ; ) { p_line1 = p_line2; @@ -926,25 +1122,28 @@ static void yv12_to_yuy2_mmxext p_y1 = p_y2; p_y2 += 2 * y_src_pitch; + if( i_y > 1 ) { + p_ub = p_u + 2 * u_src_pitch; + p_vb = p_v + 2 * v_src_pitch; + } else { + p_ub = p_u; + p_vb = p_v; + } + for( i_x = width / 8 ; i_x-- ; ) { - MMXEXT_YUV420_YUYV( ); + MMXEXT_YUV420_YUYV_INTERLACED_EVEN( ); } for( i_x = (width % 8) / 2 ; i_x-- ; ) { - C_YUV420_YUYV( ); + C_YUV420_YUYV_INTERLACED_EVEN( ); } p_y2 += i_source_margin + y_src_pitch; p_u += i_source_u_margin + u_src_pitch; p_v += i_source_v_margin + v_src_pitch; - if( i_y > 1 ) { - p_u2 += i_source_u_margin + u_src_pitch; - p_v2 += i_source_v_margin + v_src_pitch; - } else { - p_u2 = p_u; - p_v2 = p_v; - } + p_ut = p_u - 2 * u_src_pitch; + p_vt = p_v - 2 * v_src_pitch; p_line2 += i_dest_margin + yuy2_pitch; } @@ -1143,7 +1342,7 @@ void init_yuv_conversion(void) { else yv12_to_yuy2 = yv12_to_yuy2_c; - /* determine best YV12 -> YUY2 converter to use */ + /* determine best YUY2 -> YV12 converter to use */ if (xine_mm_accel() & MM_ACCEL_X86_MMXEXT) yuy2_to_yv12 = yuy2_to_yv12_mmxext; else |