author    Torsten Jager <t.jager@gmx.de>    2012-04-17 14:27:36 +0300
committer Torsten Jager <t.jager@gmx.de>    2012-04-17 14:27:36 +0300
commit    6f50ec4dfb4bc53ca1f9a6caaf703281edea86b8 (patch)
tree      af2f25b48a733426597d8efa7638727837fe64a4
parent    5a6586d1c506525cbcc17f557944094b3bb3bd28 (diff)
Improved mmx_yuv2rgb()
yuv2rgb_mmx.c scales the YUV components and truncates each one down to 8 bits individually before the addition. That leaves red and blue off by up to 2, and green off by up to 3. This little patch does the arithmetic with 10 bits per component, plus correct rounding. There seems to be no noticeable impact on performance, but color gradients come out much smoother now.
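Where those error bounds come from, as a minimal scalar sketch (plain C; the fixed-point values are made up, and the 13 fraction bits stand in for what the old psllw-by-3 plus pmulhw sequence effectively discards): green is the sum of three individually truncated terms (luma, u_green, v_green), red and blue of only two, so worst-case truncation loses almost 3 resp. 2 LSB of the final 8-bit value.

    /* Hypothetical worst-case demo of per-term truncation vs. one
     * rounded sum; values are made up, not taken from the csc table. */
    #include <stdio.h>

    int main(void)
    {
        /* three contributions in 13-bit fixed point, each sitting just
         * below the next integer step (worst case for truncation) */
        int luma    = (100 << 13) + 8191;
        int u_green = ( 20 << 13) + 8191;
        int v_green = ( 30 << 13) + 8191;

        /* old path: truncate every term separately, then add */
        int per_term = (luma >> 13) + (u_green >> 13) + (v_green >> 13);

        /* ideal: add first, round once (+0.5 before the shift) */
        int rounded = (luma + u_green + v_green + (1 << 12)) >> 13;

        printf("per-term: %d, rounded sum: %d, error: %d\n",
               per_term, rounded, rounded - per_term);
        /* prints: per-term: 150, rounded sum: 153, error: 3 */
        return 0;
    }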
 src/video_out/yuv2rgb_mmx.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/src/video_out/yuv2rgb_mmx.c b/src/video_out/yuv2rgb_mmx.c
index 9cbb675bb..c0c362bfd 100644
--- a/src/video_out/yuv2rgb_mmx.c
+++ b/src/video_out/yuv2rgb_mmx.c
@@ -119,6 +119,7 @@ static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv, mmx_cs
{
static mmx_t mmx_80w = {0x0080008000800080ULL};
static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffULL};
+ static mmx_t mmx_0002w = {0x0002000200020002ULL};
movq_m2r (*py, mm6); // mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
pxor_r2r (mm4, mm4); // mm4 = 0
@@ -133,10 +134,11 @@ static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv, mmx_cs
psrlw_i2r (8, mm7); // mm7 = Y7 Y5 Y3 Y1
movd_m2r (*pv, mm1); // mm1 = 00 00 00 00 v3 v2 v1 v0
- psllw_i2r (3, mm6); // promote precision
+ psllw_i2r (5, mm6); // promote precision
pmulhw_m2r (csc->Y_coeff, mm6); // mm6 = luma_rgb even
- psllw_i2r (3, mm7); // promote precision
+ paddsw_m2r (mmx_0002w, mm6); // +0.5 for later rounding
+ psllw_i2r (5, mm7); // promote precision
punpcklbw_r2r (mm4, mm0); // mm0 = u3 u2 u1 u0
@@ -144,12 +146,13 @@ static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv, mmx_cs
punpcklbw_r2r (mm4, mm1); // mm1 = v3 v2 v1 v0
pmulhw_m2r (csc->Y_coeff, mm7); // mm7 = luma_rgb odd
- psllw_i2r (3, mm0); // promote precision
+ paddsw_m2r (mmx_0002w, mm7); // +0.5 for later rounding
+ psllw_i2r (5, mm0); // promote precision
psubsw_m2r (mmx_80w, mm1); // v -= 128
movq_r2r (mm0, mm2); // mm2 = u3 u2 u1 u0
- psllw_i2r (3, mm1); // promote precision
+ psllw_i2r (5, mm1); // promote precision
movq_r2r (mm1, mm4); // mm4 = v3 v2 v1 v0
@@ -168,12 +171,14 @@ static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv, mmx_cs
paddsw_r2r (mm6, mm0); // mm0 = B6 B4 B2 B0
paddsw_r2r (mm7, mm3); // mm3 = B7 B5 B3 B1
+ psraw_i2r (2, mm0); // div round
packuswb_r2r (mm0, mm0); // saturate to 0-255
pmulhw_m2r (csc->U_green, mm2); // mm2 = u * u_green
+ psraw_i2r (2, mm3); // div round
packuswb_r2r (mm3, mm3); // saturate to 0-255
@@ -197,16 +202,20 @@ static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv, mmx_cs
paddsw_r2r (mm6, mm2); // mm2 = G6 G4 G2 G0
+ psraw_i2r (2, mm2); // div round
packuswb_r2r (mm2, mm2); // saturate to 0-255
paddsw_r2r (mm6, mm1); // mm1 = R6 R4 R2 R0
+ psraw_i2r (2, mm1); // div round
packuswb_r2r (mm1, mm1); // saturate to 0-255
paddsw_r2r (mm7, mm4); // mm4 = R7 R5 R3 R1
+ psraw_i2r (2, mm4); // div round
packuswb_r2r (mm4, mm4); // saturate to 0-255
paddsw_r2r (mm7, mm5); // mm5 = G7 G5 G3 G1
+ psraw_i2r (2, mm5); // div round
packuswb_r2r (mm5, mm5); // saturate to 0-255
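For reference, the before/after arithmetic of the red path in scalar form. This is a sketch only: red_old, red_new, y_coeff and v_coeff are hypothetical stand-ins for entries of the csc coefficient table, and the Y offset and byte-unpacking handled by the real MMX code are omitted here.

    #include <stdint.h>

    static uint8_t clamp255(int v)   /* models packuswb saturation */
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* old: psllw 3 + pmulhw truncates each term to 8-bit scale
     * before the terms are added */
    static uint8_t red_old(int y, int v, int16_t y_coeff, int16_t v_coeff)
    {
        int luma   = ((y << 3) * y_coeff) >> 16;           /* pmulhw */
        int chroma = (((v - 128) << 3) * v_coeff) >> 16;
        return clamp255(luma + chroma);
    }

    /* new: psllw 5 keeps two extra fraction bits (10-bit terms),
     * +2 is +0.5 in that format, and the final psraw 2 divides by 4,
     * so the result is rounded rather than truncated */
    static uint8_t red_new(int y, int v, int16_t y_coeff, int16_t v_coeff)
    {
        int luma   = (((y << 5) * y_coeff) >> 16) + 2;     /* paddsw mmx_0002w */
        int chroma = (((v - 128) << 5) * v_coeff) >> 16;
        return clamp255((luma + chroma) >> 2);             /* psraw 2 */
    }

Green works the same way with two chroma terms instead of one; the point of the patch is that per-term truncation now happens at 10-bit granularity, and the single rounded shift at the end replaces three full-size truncations with one correctly rounded division.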