From 365a4bcbae8cd1a59304f9d07ee03f66a84e0f7c Mon Sep 17 00:00:00 2001
From: Petri Hintukainen <phintuka@users.sourceforge.net>
Date: Wed, 18 Apr 2007 05:42:18 +0300
Subject: Alphablending optimizations: - validate palette alpha values in
 overlay manager   (one check / overlay / palette index) instead of   checking
 every alpha value twice for every   blended pixel in every frame - remove
 unneeded calculations - approximate expensive integer divisions with  
 multiplication and shift

---
 src/xine-engine/alphablend.c    | 78 ++++++++++++++++-------------------------
 src/xine-engine/video_out.h     |  2 ++
 src/xine-engine/video_overlay.c | 10 +++++-
 3 files changed, 41 insertions(+), 49 deletions(-)

diff --git a/src/xine-engine/alphablend.c b/src/xine-engine/alphablend.c
index 9947da365..2e3af5422 100644
--- a/src/xine-engine/alphablend.c
+++ b/src/xine-engine/alphablend.c
@@ -1062,46 +1062,37 @@ static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb,
     /* get opacity of the 4 pixels that share chroma */
     int o00 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 0 ];
     int o01 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 1 ];
+    int o = o00 + o01;
     int o10 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 0 ];
+    o += o10;
     int o11 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 1 ];
+    o += o11;
 
     /* are there any pixels a little bit opaque? */
-    if (o00 || o01 || o10 || o11) {
+    if (o) {
       /* get the chroma components of the 4 pixels */
-      int cr00 = -128 + (*blend_yuv_data)[ 1 ][ 0 ][ x + 0 ];
-      int cr01 = -128 + (*blend_yuv_data)[ 1 ][ 0 ][ x + 1 ];
-      int cr10 = -128 + (*blend_yuv_data)[ 1 ][ 1 ][ x + 0 ];
-      int cr11 = -128 + (*blend_yuv_data)[ 1 ][ 1 ][ x + 1 ];
+      int cr00 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 0 ];
+      int cr01 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 1 ];
+      int cr10 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 0 ];
+      int cr11 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 1 ];
           
-      int cb00 = -128 + (*blend_yuv_data)[ 2 ][ 0 ][ x + 0 ];
-      int cb01 = -128 + (*blend_yuv_data)[ 2 ][ 0 ][ x + 1 ];
-      int cb10 = -128 + (*blend_yuv_data)[ 2 ][ 1 ][ x + 0 ];
-      int cb11 = -128 + (*blend_yuv_data)[ 2 ][ 1 ][ x + 1 ];
+      int cb00 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 0 ];
+      int cb01 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 1 ];
+      int cb10 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 0 ];
+      int cb11 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 1 ];
 
       /* are all pixels completely opaque? */
-      if (o00 >= 0xf && o01 >= 0xf && o10 >= 0xf && o11 >= 0xf) {
+      if (o >= 4*0xf) {
         /* set the output chroma to the average of the four pixels */
-        *dst_cr = 128 + (cr00 + cr01 + cr10 + cr11) / 4;
-        *dst_cb = 128 + (cb00 + cb01 + cb10 + cb11) / 4;
+        *dst_cr = (cr00 + cr01 + cr10 + cr11) / 4;
+        *dst_cb = (cb00 + cb01 + cb10 + cb11) / 4;
       } else {
-        int t4, cr, cb;
-        
-        /* blending required, so clamp opacity values to allowed range */
-        if (o00 > 0xf) o00 = 0xf;
-        if (o01 > 0xf) o01 = 0xf;
-        if (o10 > 0xf) o10 = 0xf;
-        if (o11 > 0xf) o11 = 0xf;
-
         /* calculate transparency of background over the four pixels */
-        t4 = (0xf - o00) + (0xf - o01) + (0xf - o10) + (0xf - o11);
-
-        /* get background chroma */
-        cr = -128 + *dst_cr;
-        cb = -128 + *dst_cb;
+        int t4 = 4*0xf - o;
 
         /* blend the output chroma to the average of the four pixels */
-        *dst_cr = 128 + (cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) / (4 * 0xf);
-        *dst_cb = 128 + (cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) / (4 * 0xf);
+        *dst_cr = ((*dst_cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) * (0x1111+1)) >> 18;
+        *dst_cb = ((*dst_cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) * (0x1111+1)) >> 18;
       }
     }
 
@@ -1480,38 +1471,29 @@ static void blend_yuy2_exact(uint8_t *dst_cr, uint8_t *dst_cb,
     /* get opacity of the 2 pixels that share chroma */
     int o0 = (*blend_yuy2_data)[ 0 ][ x + 0 ];
     int o1 = (*blend_yuy2_data)[ 0 ][ x + 1 ];
+    int o = o0 + o1;
 
     /* are there any pixels a little bit opaque? */
-    if (o0 || o1) {
+    if (o) {
       /* get the chroma components of the 2 pixels */
-      int cr0 = -128 + (*blend_yuy2_data)[ 1 ][ x + 0 ];
-      int cr1 = -128 + (*blend_yuy2_data)[ 1 ][ x + 1 ];
+      int cr0 = (*blend_yuy2_data)[ 1 ][ x + 0 ];
+      int cr1 = (*blend_yuy2_data)[ 1 ][ x + 1 ];
           
-      int cb0 = -128 + (*blend_yuy2_data)[ 2 ][ x + 0 ];
-      int cb1 = -128 + (*blend_yuy2_data)[ 2 ][ x + 1 ];
+      int cb0 = (*blend_yuy2_data)[ 2 ][ x + 0 ];
+      int cb1 = (*blend_yuy2_data)[ 2 ][ x + 1 ];
 
       /* are all pixels completely opaque? */
-      if (o0 >= 0xf && o1 >= 0xf) {
+      if (o >= 2*0xf) {
         /* set the output chroma to the average of the two pixels */
-        *dst_cr = 128 + (cr0 + cr1) / 2;
-        *dst_cb = 128 + (cb0 + cb1) / 2;
+        *dst_cr = (cr0 + cr1) / 2;
+        *dst_cb = (cb0 + cb1) / 2;
       } else {
-        int t2, cr, cb;
-        
-        /* blending required, so clamp opacity values to allowed range */
-        if (o0 > 0xf) o0 = 0xf;
-        if (o1 > 0xf) o1 = 0xf;
-
         /* calculate transparency of background over the two pixels */
-        t2 = (0xf - o0) + (0xf - o1);
-
-        /* get background chroma */
-        cr = -128 + *dst_cr;
-        cb = -128 + *dst_cb;
+        int t2 = 2*0xf - o;
 
         /* blend the output chroma to the average of the two pixels */
-        *dst_cr = 128 + (cr * t2 + cr0 * o0 + cr1 * o1) / (2 * 0xf);
-        *dst_cb = 128 + (cb * t2 + cb0 * o0 + cb1 * o1) / (2 * 0xf);
+        *dst_cr = ((*dst_cr * t2 + cr0 * o0 + cr1 * o1) * (0x1111+1)) >> 17;
+        *dst_cb = ((*dst_cb * t2 + cb0 * o0 + cb1 * o1) * (0x1111+1)) >> 17;
       }
     }
 
diff --git a/src/xine-engine/video_out.h b/src/xine-engine/video_out.h
index 7b42c43ed..085752dd8 100644
--- a/src/xine-engine/video_out.h
+++ b/src/xine-engine/video_out.h
@@ -255,6 +255,8 @@ struct xine_video_port_s {
    the palette. This should probably be classified as a bug. */
 #define OVL_PALETTE_SIZE 256
 
+#define OVL_MAX_OPACITY  0x0f
+
 /* number of recent frames to keep in memory
    these frames are needed by some deinterlace algorithms
    FIXME: we need a method to flush the recent frames (new stream)
diff --git a/src/xine-engine/video_overlay.c b/src/xine-engine/video_overlay.c
index 7bd9292c8..231aa5a70 100644
--- a/src/xine-engine/video_overlay.c
+++ b/src/xine-engine/video_overlay.c
@@ -288,10 +288,18 @@ static int32_t video_overlay_add_event(video_overlay_manager_t *this_gen,  void
     }
     
     if( event->object.overlay ) {
+      int i;
+      for(i = 0; i < OVL_PALETTE_SIZE; i++) {
+	if(event->object.overlay->trans[i] >= OVL_MAX_OPACITY)
+	  event->object.overlay->trans[i] = OVL_MAX_OPACITY;
+	if(event->object.overlay->hili_trans[i] >= OVL_MAX_OPACITY)
+	  event->object.overlay->hili_trans[i] = OVL_MAX_OPACITY;
+      }
+
       this->events[new_event].event->object.overlay = xine_xmalloc (sizeof(vo_overlay_t));
       xine_fast_memcpy(this->events[new_event].event->object.overlay, 
            event->object.overlay, sizeof(vo_overlay_t));
-    
+
       /* We took the callers rle and data, therefore it will be our job to free it */
       /* clear callers overlay so it will not be freed twice */
       memset(event->object.overlay,0,sizeof(vo_overlay_t));
-- 
cgit v1.2.3


From 88c59571bc29d92bced7d5f1f7681d9a39649aa7 Mon Sep 17 00:00:00 2001
From: Petri Hintukainen <phintuka@users.sourceforge.net>
Date: Mon, 23 Apr 2007 13:13:04 +0300
Subject: Add comments about used blending equation Remove unused x_odd
 parameter from blend_???_exact functions

---
 src/xine-engine/alphablend.c | 55 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/src/xine-engine/alphablend.c b/src/xine-engine/alphablend.c
index 2e3af5422..25c223272 100644
--- a/src/xine-engine/alphablend.c
+++ b/src/xine-engine/alphablend.c
@@ -1052,8 +1052,7 @@ static void mem_blend8(uint8_t *mem, uint8_t val, uint8_t o, size_t sz)
   }
 }
 
-static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb,
-                            int src_width, int x_odd,
+static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, 
                             uint8_t *(*blend_yuv_data)[ 3 ][ 2 ])
 {
   int x;
@@ -1091,6 +1090,7 @@ static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb,
         int t4 = 4*0xf - o;
 
         /* blend the output chroma to the average of the four pixels */
+        /* for explanation of the used equation, see blend_yuy2_exact() */
         *dst_cr = ((*dst_cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) * (0x1111+1)) >> 18;
         *dst_cb = ((*dst_cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) * (0x1111+1)) >> 18;
       }
@@ -1431,7 +1431,7 @@ void _x_blend_yuv (uint8_t *dst_base[3], vo_overlay_t * img_overl,
             memset(&(*blend_yuv_data)[ 0 ][ 1 ][ 0 ], 0, exact_blend_width_m2);
           }
           
-          blend_yuv_exact(dst_cr, dst_cb, exact_blend_width, x_odd, blend_yuv_data);
+          blend_yuv_exact(dst_cr, dst_cb, exact_blend_width, blend_yuv_data);
           
           any_line_buffered = 0;
         }
@@ -1452,7 +1452,7 @@ void _x_blend_yuv (uint8_t *dst_base[3], vo_overlay_t * img_overl,
         memset(&(*blend_yuv_data)[ 0 ][ 1 ][ 0 ], 0, exact_blend_width_m2);
       }
       
-      blend_yuv_exact(dst_cr, dst_cb, exact_blend_width, x_odd, blend_yuv_data);
+      blend_yuv_exact(dst_cr, dst_cb, exact_blend_width, blend_yuv_data);
     }
   }
       
@@ -1461,8 +1461,7 @@ void _x_blend_yuv (uint8_t *dst_base[3], vo_overlay_t * img_overl,
 #endif
 }
             
-static void blend_yuy2_exact(uint8_t *dst_cr, uint8_t *dst_cb,
-                             int src_width, int x_odd,
+static void blend_yuy2_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width,
                              uint8_t *(*blend_yuy2_data)[ 3 ])
 {
   int x;
@@ -1491,7 +1490,49 @@ static void blend_yuy2_exact(uint8_t *dst_cr, uint8_t *dst_cb,
         /* calculate transparency of background over the two pixels */
         int t2 = 2*0xf - o;
 
+	/*
+	 * No need to adjust chroma values with +/- 128:
+	 *   *dst_cb 
+	 *   = 128 + ((*dst_cb-128) * t2 + (cb0-128) * o0 + (cb1-128) * o1) / (2 * 0xf);
+	 *   = 128 + (*dst_cb * t2 + cb0 * o0 + cb1 * o1 + (t2*(-128) - 128*o0 - 128*o1)) / (2 * 0xf);
+	 *   = 128 + (*dst_cb * t2 + cb0 * o0 + cb1 * o1 + ((2*0xf-o0-o1)*(-128) - 128*o0 - 128*o1)) / (2 * 0xf);
+	 *   = 128 + (*dst_cb * t2 + cb0 * o0 + cb1 * o1 + (2*0xf*(-128))) / (2 * 0xf);
+	 *   = 128 + (*dst_cb * t2 + cb0 * o0 + cb1 * o1) / (2 * 0xf) - 128;
+	 *   =       (*dst_cb * t2 + cb0 * o0 + cb1 * o1) / (2 * 0xf);
+	 *
+	 * Convert slow divisions to multiplication and shift:
+	 *     X/0xf
+	 *   = X * (1/0xf)
+	 *   = X * (0x1111/0x1111) * (1/0xf)
+	 *   = X * 0x1111/0xffff
+	 *   =(almost) X * 0x1112/0x10000
+	 *   = (X * 0x1112) >> 16
+	 *
+	 * The tricky point is 0x1111/0xffff --> 0x1112/0x10000.
+	 * All calculations are done using integers and X is in
+	 * range of [0 ... 0xff*0xf*4]. This results in error of
+	 *     X*0x1112/0x10000 - X/0xf
+	 *   = X*(0x1112/0x10000 - 1/0xf)
+	 *   = X*(0x0.1112 - 0x0.111111...)
+	 *   = X*0x0.0000eeeeee...
+	 *   = [0 ... 0x0.37c8...]      when X in [0 ... 0x3bc4]
+	 * The error is never negative and always less than one, so the
+	 * truncated result (>>16) is never below X/0xf and at most one above
+	 * it. 0x1112 is selected instead of the closer 0x1111 because with
+	 * 0x1111 the error is negative (X*(-0x0.00001111...)) and exact
+	 * multiples of 0xf would be rounded down by one.
+	 * NOTE: "=(almost)" is exact only for X < 0x1256; above that a
+	 * quotient may be one too large, e.g. (0x1256*0x1112)>>16 = 313, not 312.
+	 *
+	 * 1/0x10000 (= >>16) was originally selected because of the MMX pmulhw
+	 * instruction; it makes it possible to do the whole calculation in MMX
+	 * using 16-bit words (pmulhw yields (X*Y)>>16).
+	 * 
+	 * Here X/(2*0xf) = X/0xf/2 = ((X*0x1112)>>16)>>1 = (X*0x1112)>>17
+	 */
+
         /* blend the output chroma to the average of the two pixels */
+        /* *dst_cr = 128 + ((*dst_cr-128) * t2 + (cr0-128) * o0 + (cr1-128) * o1) / (2 * 0xf); */
         *dst_cr = ((*dst_cr * t2 + cr0 * o0 + cr1 * o1) * (0x1111+1)) >> 17;
         *dst_cb = ((*dst_cb * t2 + cb0 * o0 + cb1 * o1) * (0x1111+1)) >> 17;
       }
@@ -1837,7 +1878,7 @@ void _x_blend_yuy2 (uint8_t * dst_img, vo_overlay_t * img_overl,
     if (enable_exact_blending) {
       /* blend buffered line */
       if (any_line_buffered) {
-        blend_yuy2_exact(dst_y - x_odd * 2 + 3, dst_y - x_odd * 2 + 1, exact_blend_width, x_odd, blend_yuy2_data);
+        blend_yuy2_exact(dst_y - x_odd * 2 + 3, dst_y - x_odd * 2 + 1, exact_blend_width, blend_yuy2_data);
         
         any_line_buffered = 0;
       }
-- 
cgit v1.2.3