implement simple interpolation for 4:2:0 -> 4:2:2 upsampling

use mmxext non-temporal hints and pavgb CVS patchset: 5086 CVS date: 2003/06/22 15:03:43
author: Miguel Freitas <miguelfreitas@users.sourceforge.net> 2003-06-22 15:03:43 +0000
committer: Miguel Freitas <miguelfreitas@users.sourceforge.net> 2003-06-22 15:03:43 +0000
commit: 8531aac55030077b509f4db3cb5c85f04b08f7a5 (patch)
tree: e5d75cdff2ad3f63cb56da0c6e117d0c7f9f8162
parent: 3a29d80023f7950f43323b9e88c00a8601533a80 (diff)
download: xine-lib-8531aac55030077b509f4db3cb5c85f04b08f7a5.tar.gz
xine-lib-8531aac55030077b509f4db3cb5c85f04b08f7a5.tar.bz2
1 files changed, 84 insertions, 21 deletions
diff --git a/src/xine-utils/color.c b/src/xine-utils/color.c
index 28cdc6879..ecddbe2f6 100644
--- a/src/xine-utils/color.c
+++ b/src/xine-utils/color.c
@@ -61,7 +61,7 @@
  * instructions), these macros will automatically map to those special
  * instructions.
  *
- * $Id: color.c,v 1.17 2003/06/03 03:33:16 miguelfreitas Exp $
+ * $Id: color.c,v 1.18 2003/06/22 15:03:43 miguelfreitas Exp $
  */
 
 #include "xine_internal.h"
@@ -654,16 +654,16 @@ void yuv411_to_yv12_c
 
 }
 
-#define C_YUV420_YUYV( )                                                    \
-    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
-    *(p_line1)++ =            *(p_line2)++ = *(p_u)++;                      \
-    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
-    *(p_line1)++ =            *(p_line2)++ = *(p_v)++;                      \
+#define C_YUV420_YUYV( )                                          \
+    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++;                   \
+    *p_line1++ = *p_u;    *p_line2++ = (*p_u++ + *p_u2++)>>1;     \
+    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++;                   \
+    *p_line1++ = *p_v;    *p_line2++ = (*p_v++ + *p_v2++)>>1;
 
 /*****************************************************************************
  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
  * original conversion routine from Videolan project
- * changed to support interlaced frames. no filter/interpolation yet.
+ * changed to support interlaced frames and use simple mean interpolation [MF]
  *****************************************************************************/
 void yv12_to_yuy2_c
   (unsigned char *y_src, int y_src_pitch, 
@@ -676,6 +676,8 @@ void yv12_to_yuy2_c
     uint8_t *p_y1, *p_y2 = y_src;
     uint8_t *p_u = u_src;
     uint8_t *p_v = v_src;
+    uint8_t *p_u2 = u_src + u_src_pitch;
+    uint8_t *p_v2 = v_src + v_src_pitch;
 
     int i_x, i_y;
 
@@ -706,11 +708,20 @@ void yv12_to_yuy2_c
           p_y2 += i_source_margin;
           p_u += i_source_u_margin;
           p_v += i_source_v_margin;
+          if( i_y > 1 ) {
+            p_u2 += i_source_u_margin;
+            p_v2 += i_source_v_margin;
+          } else {
+            p_u2 = p_u;
+            p_v2 = p_v;
+          }
           p_line2 += i_dest_margin;
       }
 
     } else {
 
+      p_u2 = u_src + 2*u_src_pitch;
+      p_v2 = v_src + 2*v_src_pitch;
       for( i_y = height / 4 ; i_y-- ; )
       {
           p_line1 = p_line2;
@@ -730,6 +741,13 @@ void yv12_to_yuy2_c
           p_y2 += i_source_margin + y_src_pitch;
           p_u += i_source_u_margin + u_src_pitch;
           p_v += i_source_v_margin + v_src_pitch;
+          if( i_y > 1 ) {
+            p_u2 += i_source_u_margin + u_src_pitch;
+            p_v2 += i_source_v_margin + v_src_pitch;
+          } else {
+            p_u2 = p_u;
+            p_v2 = p_v;
+          }
           p_line2 += i_dest_margin + yuy2_pitch;
       }
   
@@ -737,6 +755,8 @@ void yv12_to_yuy2_c
       p_y2 = y_src + y_src_pitch;
       p_u = u_src + u_src_pitch;
       p_v = v_src + v_src_pitch;
+      p_u2 = u_src + 3*u_src_pitch;
+      p_v2 = v_src + 3*v_src_pitch;
   
       for( i_y = height / 4 ; i_y-- ; )
       {
@@ -757,6 +777,13 @@ void yv12_to_yuy2_c
           p_y2 += i_source_margin + y_src_pitch;
           p_u += i_source_u_margin + u_src_pitch;
           p_v += i_source_v_margin + v_src_pitch;
+          if( i_y > 1 ) {
+            p_u2 += i_source_u_margin + u_src_pitch;
+            p_v2 += i_source_v_margin + v_src_pitch;
+          } else {
+            p_u2 = p_u;
+            p_v2 = p_v;
+          }
           p_line2 += i_dest_margin + yuy2_pitch;
       }
 
@@ -766,7 +793,7 @@ void yv12_to_yuy2_c
 
 #ifdef ARCH_X86
 
-#define MMX_YUV420_YUYV( )                                                         \
+#define MMXEXT_YUV420_YUYV( )                                                      \
 do {                                                                               \
    __asm__ __volatile__(".align 8 \n\t"                                            \
     "movq       (%0), %%mm0 \n\t"  /* Load 8 Y          y7 y6 y5 y4 y3 y2 y1 y0 */ \
@@ -777,24 +804,32 @@ do {
     "punpcklbw %%mm1, %%mm2 \n\t"  /*                   v1 y3 u1 y2 v0 y1 u0 y0 */ \
     :                                                                              \
     : "r" (p_y1), "r" (p_u), "r" (p_v) );                                          \
-   __asm__ __volatile__(".align 8 \n\t"                                            \
-    "movq      %%mm2, (%0)  \n\t"  /* Store low YUYV                            */ \
+   __asm__ __volatile__(                                                           \
+    "movd       (%0), %%mm3 \n\t"  /* Load 4 Cb         00 00 00 00 u3 u2 u1 u0 */ \
+    "movd       (%1), %%mm4 \n\t"  /* Load 4 Cr         00 00 00 00 v3 v2 v1 v0 */ \
+    "punpcklbw %%mm4, %%mm3 \n\t"  /*                   v3 u3 v2 u2 v1 u1 v0 u0 */ \
+    "pavgb     %%mm1, %%mm3 \n\t"  /* (mean)            v3 u3 v2 u2 v1 u1 v0 u0 */ \
+    :                                                                              \
+    : "r" (p_u2), "r" (p_v2) );                                                    \
+   __asm__ __volatile__(                                                           \
+    "movntq    %%mm2, (%0)  \n\t"  /* Store low YUYV                            */ \
     "punpckhbw %%mm1, %%mm0 \n\t"  /*                   v3 y7 u3 y6 v2 y5 u2 y4 */ \
-    "movq      %%mm0, 8(%0) \n\t"  /* Store high YUYV                           */ \
+    "movntq    %%mm0, 8(%0) \n\t"  /* Store high YUYV                           */ \
     "movq       (%2), %%mm0 \n\t"  /* Load 8 Y          Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
     "movq      %%mm0, %%mm2 \n\t"  /*                   Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
-    "punpcklbw %%mm1, %%mm2 \n\t"  /*                   v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
-    "movq      %%mm2, (%1)  \n\t"  /* Store low YUYV                            */ \
-    "punpckhbw %%mm1, %%mm0 \n\t"  /*                   v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
-    "movq      %%mm0, 8(%1) \n\t"  /* Store high YUYV                           */ \
+    "punpcklbw %%mm3, %%mm2 \n\t"  /*                   v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
+    "movntq    %%mm2, (%1)  \n\t"  /* Store low YUYV                            */ \
+    "punpckhbw %%mm3, %%mm0 \n\t"  /*                   v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
+    "movntq    %%mm0, 8(%1) \n\t"  /* Store high YUYV                           */ \
     :                                                                              \
     : "r" (p_line1),  "r" (p_line2),  "r" (p_y2) );                                \
   p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4;          \
+  p_u2 += 4; p_v2 += 4;                                                            \
 } while(0)
 
 #endif
 
-void yv12_to_yuy2_mmx
+void yv12_to_yuy2_mmxext
   (unsigned char *y_src, int y_src_pitch, 
    unsigned char *u_src, int u_src_pitch, 
    unsigned char *v_src, int v_src_pitch, 
@@ -805,6 +840,8 @@ void yv12_to_yuy2_mmx
     uint8_t *p_y1, *p_y2 = y_src;
     uint8_t *p_u = u_src;
     uint8_t *p_v = v_src;
+    uint8_t *p_u2 = u_src + u_src_pitch;
+    uint8_t *p_v2 = v_src + v_src_pitch;
 
     int i_x, i_y;
 
@@ -825,17 +862,26 @@ void yv12_to_yuy2_mmx
   
           for( i_x = width / 8 ; i_x-- ; )
           {
-              MMX_YUV420_YUYV( );
+              MMXEXT_YUV420_YUYV( );
           }
   
           p_y2 += i_source_margin;
           p_u += i_source_u_margin;
           p_v += i_source_v_margin;
+          if( i_y > 1 ) {
+            p_u2 += i_source_u_margin;
+            p_v2 += i_source_v_margin;
+          } else {
+            p_u2 = p_u;
+            p_v2 = p_v;
+          }
           p_line2 += i_dest_margin;
       }
 
     } else {
 
+      p_u2 = u_src + 2*u_src_pitch;
+      p_v2 = v_src + 2*v_src_pitch;
       for( i_y = height / 4 ; i_y-- ; )
       {
           p_line1 = p_line2;
@@ -846,12 +892,19 @@ void yv12_to_yuy2_mmx
   
           for( i_x = width / 8 ; i_x-- ; )
           {
-              MMX_YUV420_YUYV( );
+              MMXEXT_YUV420_YUYV( );
           }
   
           p_y2 += i_source_margin + y_src_pitch;
           p_u += i_source_u_margin + u_src_pitch;
           p_v += i_source_v_margin + v_src_pitch;
+          if( i_y > 1 ) {
+            p_u2 += i_source_u_margin + u_src_pitch;
+            p_v2 += i_source_v_margin + v_src_pitch;
+          } else {
+            p_u2 = p_u;
+            p_v2 = p_v;
+          }
           p_line2 += i_dest_margin + yuy2_pitch;
       }
   
@@ -859,6 +912,8 @@ void yv12_to_yuy2_mmx
       p_y2 = y_src + y_src_pitch;
       p_u = u_src + u_src_pitch;
       p_v = v_src + v_src_pitch;
+      p_u2 = u_src + 3*u_src_pitch;
+      p_v2 = v_src + 3*v_src_pitch;
   
       for( i_y = height / 4 ; i_y-- ; )
       {
@@ -870,17 +925,25 @@ void yv12_to_yuy2_mmx
   
           for( i_x = width / 8 ; i_x-- ; )
           {
-              MMX_YUV420_YUYV( );
+              MMXEXT_YUV420_YUYV( );
           }
   
           p_y2 += i_source_margin + y_src_pitch;
           p_u += i_source_u_margin + u_src_pitch;
           p_v += i_source_v_margin + v_src_pitch;
+          if( i_y > 1 ) {
+            p_u2 += i_source_u_margin + u_src_pitch;
+            p_v2 += i_source_v_margin + v_src_pitch;
+          } else {
+            p_u2 = p_u;
+            p_v2 = p_v;
+          }
           p_line2 += i_dest_margin + yuy2_pitch;
       }
 
     }
 
+    sfence();
     emms();
 
 #endif
@@ -920,8 +983,8 @@ void init_yuv_conversion(void) {
     yuv444_to_yuy2 = yuv444_to_yuy2_c;
 
   /* determine best YV12 -> YUY2 converter to use */
-  if (xine_mm_accel() & MM_ACCEL_X86_MMX)
-    yv12_to_yuy2 = yv12_to_yuy2_mmx;
+  if (xine_mm_accel() & MM_ACCEL_X86_MMXEXT)
+    yv12_to_yuy2 = yv12_to_yuy2_mmxext;
   else
     yv12_to_yuy2 = yv12_to_yuy2_c;
author	Miguel Freitas <miguelfreitas@users.sourceforge.net>	2003-06-22 15:03:43 +0000
committer	Miguel Freitas <miguelfreitas@users.sourceforge.net>	2003-06-22 15:03:43 +0000
commit	8531aac55030077b509f4db3cb5c85f04b08f7a5 (patch)
tree	e5d75cdff2ad3f63cb56da0c6e117d0c7f9f8162
parent	3a29d80023f7950f43323b9e88c00a8601533a80 (diff)
download	xine-lib-8531aac55030077b509f4db3cb5c85f04b08f7a5.tar.gz xine-lib-8531aac55030077b509f4db3cb5c85f04b08f7a5.tar.bz2