author     Darren Salt <linux@youmustbejoking.demon.co.uk>  2011-12-31 22:45:27 +0000
committer  Darren Salt <linux@youmustbejoking.demon.co.uk>  2011-12-31 22:45:27 +0000
commit     ec02c795c9511d832b2034a5ce55c1709152b9c7 (patch)
tree       9557df6d2f0ef07ead5a62011c45d11d8474f1bc
parent     96801061c49f647d815b01375037484c374a813f (diff)
parent     93a57f68079e39d73cdd884e631a23cfb315de3f (diff)
download   xine-lib-ec02c795c9511d832b2034a5ce55c1709152b9c7.tar.gz
           xine-lib-ec02c795c9511d832b2034a5ce55c1709152b9c7.tar.bz2

Merge from 1.1.

-rw-r--r--  src/combined/ffmpeg/ff_video_decoder.c | 115
-rw-r--r--  src/xine-utils/color.c                  | 377
2 files changed, 353 insertions(+), 139 deletions(-)
diff --git a/src/combined/ffmpeg/ff_video_decoder.c b/src/combined/ffmpeg/ff_video_decoder.c
index c75848bee..b11967880 100644
--- a/src/combined/ffmpeg/ff_video_decoder.c
+++ b/src/combined/ffmpeg/ff_video_decoder.c
@@ -89,6 +89,7 @@ struct ff_video_decoder_s {
   xine_stream_t    *stream;
   int64_t           pts;
+  int64_t           last_pts;
   uint64_t          pts_tag_mask;
   uint64_t          pts_tag;
   int               pts_tag_counter;
@@ -422,6 +423,9 @@ static void init_video_codec (ff_video_decoder_t *this, unsigned int codec_type)
       break;
   }
 
+  /* don't want initial AV_NOPTS_VALUE here */
+  this->context->reordered_opaque = 0;
+
 }
 
 static void choose_speed_over_accuracy_cb(void *user_data, xine_cfg_entry_t *entry) {
@@ -1070,6 +1074,52 @@ static void ff_handle_special_buffer (ff_video_decoder_t *this, buf_element_t *buf)
   }
 }
 
+static uint64_t ff_tag_pts(ff_video_decoder_t *this, uint64_t pts)
+{
+  return pts | this->pts_tag;
+}
+
+static uint64_t ff_untag_pts(ff_video_decoder_t *this, uint64_t pts)
+{
+  if (this->pts_tag_mask == 0)
+    return pts; /* pts tagging inactive */
+
+  if (this->pts_tag != 0 && (pts & this->pts_tag_mask) != this->pts_tag)
+    return 0; /* reset pts if outdated while waiting for first pass (see below) */
+
+  return pts & ~this->pts_tag_mask;
+}
+
+static void ff_check_pts_tagging(ff_video_decoder_t *this, uint64_t pts)
+{
+  if (this->pts_tag_mask == 0)
+    return; /* pts tagging inactive */
+  if ((pts & this->pts_tag_mask) != this->pts_tag) {
+    this->pts_tag_stable_counter = 0;
+    return; /* pts still outdated */
+  }
+
+  /* the tag should be stable for 100 frames */
+  this->pts_tag_stable_counter++;
+
+  if (this->pts_tag != 0) {
+    if (this->pts_tag_stable_counter >= 100) {
+      /* first pass: reset pts_tag */
+      this->pts_tag = 0;
+      this->pts_tag_stable_counter = 0;
+    }
+  } else if (pts == 0)
+    return; /* cannot detect second pass */
+  else {
+    if (this->pts_tag_stable_counter >= 100) {
+      /* second pass: reset pts_tag_mask and pts_tag_counter */
+      this->pts_tag_mask = 0;
+      this->pts_tag_counter = 0;
+      this->pts_tag_stable_counter = 0;
+    }
+  }
+}
+
 static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *buf) {
   vo_frame_t *img;
@@ -1091,6 +1141,13 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *buf)
   uint8_t *current;
   int next_flush;
 
+  /* apply valid pts to first frame _starting_ thereafter only */
+  if (this->pts && !this->context->reordered_opaque) {
+    this->context->reordered_opaque =
+    this->av_frame->reordered_opaque = ff_tag_pts (this, this->pts);
+    this->pts = 0;
+  }
+
   got_picture = 0;
   if (!flush) {
     current = mpeg_parser_decode_data(this->mpeg_parser,
@@ -1172,8 +1229,11 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *buf)
       free_img = 0;
     }
 
-    img->pts  = this->pts;
-    this->pts = 0;
+    /* get back reordered pts */
+    img->pts = ff_untag_pts (this, this->av_frame->reordered_opaque);
+    ff_check_pts_tagging (this, this->av_frame->reordered_opaque);
+    this->av_frame->reordered_opaque = 0;
+    this->context->reordered_opaque = 0;
 
     if (this->av_frame->repeat_pict)
       img->duration = this->video_step * 3 / 2;
@@ -1214,52 +1274,6 @@ static void ff_handle_mpeg12_buffer (ff_video_decoder_t *this, buf_element_t *buf)
   }
 }
 
-static uint64_t ff_tag_pts(ff_video_decoder_t *this, uint64_t pts)
-{
-  return pts | this->pts_tag;
-}
-
-static uint64_t ff_untag_pts(ff_video_decoder_t *this, uint64_t pts)
-{
-  if (this->pts_tag_mask == 0)
-    return pts; /* pts tagging inactive */
-
-  if (this->pts_tag != 0 && (pts & this->pts_tag_mask) != this->pts_tag)
-    return 0; /* reset pts if outdated while waiting for first pass (see below) */
-
-  return pts & ~this->pts_tag_mask;
-}
-
-static void ff_check_pts_tagging(ff_video_decoder_t *this, uint64_t pts)
-{
-  if (this->pts_tag_mask == 0)
-    return; /* pts tagging inactive */
-  if ((pts & this->pts_tag_mask) != this->pts_tag) {
-    this->pts_tag_stable_counter = 0;
-    return; /* pts still outdated */
-  }
-
-  /* the tag should be stable for 100 frames */
-  this->pts_tag_stable_counter++;
-
-  if (this->pts_tag != 0) {
-    if (this->pts_tag_stable_counter >= 100) {
-      /* first pass: reset pts_tag */
-      this->pts_tag = 0;
-      this->pts_tag_stable_counter = 0;
-    }
-  } else if (pts == 0)
-    return; /* cannot detect second pass */
-  else {
-    if (this->pts_tag_stable_counter >= 100) {
-      /* second pass: reset pts_tag_mask and pts_tag_counter */
-      this->pts_tag_mask = 0;
-      this->pts_tag_counter = 0;
-      this->pts_tag_stable_counter = 0;
-    }
-  }
-}
-
 static void ff_handle_buffer (ff_video_decoder_t *this, buf_element_t *buf) {
   uint8_t *chunk_buf = this->buf;
   AVRational avr00 = {0, 1};
@@ -1585,8 +1599,9 @@ static void ff_decode_data (video_decoder_t *this_gen, buf_element_t *buf) {
       ff_handle_preview_buffer(this, buf);
 
     /* decode */
-    if (buf->pts)
-      this->pts = buf->pts;
+    /* PES: each valid pts shall be used only once */
+    if (buf->pts && (buf->pts != this->last_pts))
+      this->last_pts = this->pts = buf->pts;
 
     if ((buf->type & 0xFFFF0000) == BUF_VIDEO_MPEG) {
       ff_handle_mpeg12_buffer(this, buf);
diff --git a/src/xine-utils/color.c b/src/xine-utils/color.c
index 5c433ed59..930fb77d0 100644
--- a/src/xine-utils/color.c
+++ b/src/xine-utils/color.c
@@ -659,16 +659,31 @@ static void yuv411_to_yv12_c
 }
 
-#define C_YUV420_YUYV( ) \
-    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
-    *p_line1++ = *p_u;    *p_line2++ = (*p_u++ + *p_u2++)>>1; \
-    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
-    *p_line1++ = *p_v;    *p_line2++ = (*p_v++ + *p_v2++)>>1;
+#define C_YUV420_YUYV_PROGRESSIVE( ) \
+    utmp = 3 * *p_u++; \
+    vtmp = 3 * *p_v++; \
+    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+    *p_line1++ = (*p_ut++ + utmp) >> 2; *p_line2++ = (utmp + *p_ub++) >> 2; \
+    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+    *p_line1++ = (*p_vt++ + vtmp) >> 2; *p_line2++ = (vtmp + *p_vb++) >> 2;
+
+#define C_YUV420_YUYV_INTERLACED_ODD( ) \
+    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+    *p_line1++ = (*p_ut++ + *p_u * 7) >> 3; *p_line2++ = (*p_u++ * 5 + *p_ub++ * 3) >> 3; \
+    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+    *p_line1++ = (*p_vt++ + *p_v * 7) >> 3; *p_line2++ = (*p_v++ * 5 + *p_vb++ * 3) >> 3;
+
+#define C_YUV420_YUYV_INTERLACED_EVEN( ) \
+    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+    *p_line1++ = (*p_ut++ * 3 + *p_u * 5) >> 3; *p_line2++ = (*p_u++ * 7 + *p_ub++) >> 3; \
+    *p_line1++ = *p_y1++; *p_line2++ = *p_y2++; \
+    *p_line1++ = (*p_vt++ * 3 + *p_v * 5) >> 3; *p_line2++ = (*p_v++ * 7 + *p_vb++) >> 3;
 
 /*****************************************************************************
  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
  * original conversion routine from Videolan project
- * changed to support interlaced frames and use simple mean interpolation [MF]
+ * changed to support interlaced frames and do correct chroma upsampling with
+ * correct weighting factors and no chroma shift.
  *****************************************************************************/
 static void yv12_to_yuy2_c
   (const unsigned char *y_src, int y_src_pitch,
@@ -681,10 +696,12 @@ static void yv12_to_yuy2_c
   const uint8_t *p_y1, *p_y2 = y_src;
   const uint8_t *p_u = u_src;
   const uint8_t *p_v = v_src;
-  const uint8_t *p_u2 = u_src + u_src_pitch;
-  const uint8_t *p_v2 = v_src + v_src_pitch;
+  const uint8_t *p_ub, *p_vb;
+  const uint8_t *p_ut = u_src;
+  const uint8_t *p_vt = v_src;
 
   int i_x, i_y;
+  int utmp, vtmp;
 
   const int i_source_margin = y_src_pitch - width;
   const int i_source_u_margin = u_src_pitch - width/2;
@@ -702,28 +719,29 @@ static void yv12_to_yuy2_c
       p_y1 = p_y2;
       p_y2 += y_src_pitch;
 
+      if( i_y > 1 ) {
+        p_ub = p_u + u_src_pitch;
+        p_vb = p_v + v_src_pitch;
+      } else {
+        p_ub = p_u;
+        p_vb = p_v;
+      }
+
      for( i_x = width / 2 ; i_x-- ; )
      {
-        C_YUV420_YUYV( );
+        C_YUV420_YUYV_PROGRESSIVE( );
      }
 
      p_y2 += i_source_margin;
      p_u += i_source_u_margin;
      p_v += i_source_v_margin;
-      if( i_y > 1 ) {
-        p_u2 += i_source_u_margin;
-        p_v2 += i_source_v_margin;
-      } else {
-        p_u2 = p_u;
-        p_v2 = p_v;
-      }
+      p_ut = p_u - u_src_pitch;
+      p_vt = p_v - v_src_pitch;
      p_line2 += i_dest_margin;
    }
  }
  else
  {
-    p_u2 = u_src + 2*u_src_pitch;
-    p_v2 = v_src + 2*v_src_pitch;
    for( i_y = height / 4 ; i_y-- ; )
    {
      p_line1 = p_line2;
@@ -732,21 +750,24 @@ static void yv12_to_yuy2_c
      p_y1 = p_y2;
      p_y2 += 2 * y_src_pitch;
 
+      if( i_y > 1 ) {
+        p_ub = p_u + 2 * u_src_pitch;
+        p_vb = p_v + 2 * v_src_pitch;
+      } else {
+        p_ub = p_u;
+        p_vb = p_v;
+      }
+
      for( i_x = width / 2 ; i_x-- ; )
      {
-        C_YUV420_YUYV( );
+        C_YUV420_YUYV_INTERLACED_ODD( );
      }
 
      p_y2 += i_source_margin + y_src_pitch;
      p_u += i_source_u_margin + u_src_pitch;
      p_v += i_source_v_margin + v_src_pitch;
-      if( i_y > 1 ) {
-        p_u2 += i_source_u_margin + u_src_pitch;
-        p_v2 += i_source_v_margin + v_src_pitch;
-      } else {
-        p_u2 = p_u;
-        p_v2 = p_v;
-      }
+      p_ut = p_u - 2 * u_src_pitch;
+      p_vt = p_v - 2 * v_src_pitch;
      p_line2 += i_dest_margin + yuy2_pitch;
    }
 
@@ -754,8 +775,8 @@ static void yv12_to_yuy2_c
    p_y2 = y_src + y_src_pitch;
    p_u = u_src + u_src_pitch;
    p_v = v_src + v_src_pitch;
-    p_u2 = u_src + 3*u_src_pitch;
-    p_v2 = v_src + 3*v_src_pitch;
+    p_ut = p_u;
+    p_vt = p_v;
 
    for( i_y = height / 4 ; i_y-- ; )
    {
@@ -765,21 +786,24 @@ static void yv12_to_yuy2_c
      p_y1 = p_y2;
      p_y2 += 2 * y_src_pitch;
 
+      if( i_y > 1 ) {
+        p_ub = p_u + 2 * u_src_pitch;
+        p_vb = p_v + 2 * v_src_pitch;
+      } else {
+        p_ub = p_u;
+        p_vb = p_v;
+      }
+
      for( i_x = width / 2 ; i_x-- ; )
      {
-        C_YUV420_YUYV( );
+        C_YUV420_YUYV_INTERLACED_EVEN( );
      }
 
      p_y2 += i_source_margin + y_src_pitch;
      p_u += i_source_u_margin + u_src_pitch;
      p_v += i_source_v_margin + v_src_pitch;
-      if( i_y > 1 ) {
-        p_u2 += i_source_u_margin + u_src_pitch;
-        p_v2 += i_source_v_margin + v_src_pitch;
-      } else {
-        p_u2 = p_u;
-        p_v2 = p_v;
-      }
+      p_ut = p_u - 2 * u_src_pitch;
+      p_vt = p_v - 2 * v_src_pitch;
      p_line2 += i_dest_margin + yuy2_pitch;
    }
 
@@ -789,38 +813,204 @@ static void yv12_to_yuy2_c
 
 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 
-#define MMXEXT_YUV420_YUYV( ) \
+#define MMXEXT_YUV420_YUYV_PROGRESSIVE( ) \
 do { \
    __asm__ __volatile__(".align 8 \n\t" \
 "movq       (%0), %%mm0 \n\t"  /* Load 8 Y    y7 y6 y5 y4 y3 y2 y1 y0 */ \
 "movd       (%1), %%mm1 \n\t"  /* Load 4 Cb   00 00 00 00 u3 u2 u1 u0 */ \
 "movd       (%2), %%mm2 \n\t"  /* Load 4 Cr   00 00 00 00 v3 v2 v1 v0 */ \
-"punpcklbw %%mm2, %%mm1 \n\t"  /*             v3 u3 v2 u2 v1 u1 v0 u0 */ \
-"movq      %%mm0, %%mm2 \n\t"  /*             y7 y6 y5 y4 y3 y2 y1 y0 */ \
-"punpcklbw %%mm1, %%mm2 \n\t"  /*             v1 y3 u1 y2 v0 y1 u0 y0 */ \
+"pxor      %%mm7, %%mm7 \n\t"  /*             00 00 00 00 00 00 00 00 */ \
+"punpcklbw %%mm7, %%mm1 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm2 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"movq      %%mm1, %%mm3 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"movq      %%mm2, %%mm4 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"psllw     $1,    %%mm3 \n\t"  /* Cb * 2 */ \
+"psllw     $1,    %%mm4 \n\t"  /* Cr * 2 */ \
+"paddw     %%mm3, %%mm1 \n\t"  /* Cb * 3 */ \
+"paddw     %%mm4, %%mm2 \n\t"  /* Cr * 3 */ \
 : \
 : "r" (p_y1), "r" (p_u), "r" (p_v) ); \
    __asm__ __volatile__( \
-"movd       (%0), %%mm3 \n\t"  /* Load 4 Cb   00 00 00 00 u3 u2 u1 u0 */ \
-"movd       (%1), %%mm4 \n\t"  /* Load 4 Cr   00 00 00 00 v3 v2 v1 v0 */ \
-"punpcklbw %%mm4, %%mm3 \n\t"  /*             v3 u3 v2 u2 v1 u1 v0 u0 */ \
-"pavgb     %%mm1, %%mm3 \n\t"  /* (mean)      v3 u3 v2 u2 v1 u1 v0 u0 */ \
+"movd       (%0), %%mm3 \n\t"  /* Load 4 Cbt  00 00 00 00 u3 u2 u1 u0 */ \
+"movd       (%1), %%mm4 \n\t"  /* Load 4 Crt  00 00 00 00 v3 v2 v1 v0 */ \
+"movd       (%2), %%mm5 \n\t"  /* Load 4 Cbb  00 00 00 00 u3 u2 u1 u0 */ \
+"movd       (%3), %%mm6 \n\t"  /* Load 4 Crb  00 00 00 00 v3 v2 v1 v0 */ \
+"punpcklbw %%mm7, %%mm3 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm4 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"punpcklbw %%mm7, %%mm5 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm6 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"paddw     %%mm1, %%mm3 \n\t"  /* Cb1 = Cbt + 3*Cb */ \
+"paddw     %%mm2, %%mm4 \n\t"  /* Cr1 = Crt + 3*Cr */ \
+"paddw     %%mm5, %%mm1 \n\t"  /* Cb2 = Cbb + 3*Cb */ \
+"paddw     %%mm6, %%mm2 \n\t"  /* Cr2 = Crb + 3*Cr */ \
+"psrlw     $2,    %%mm3 \n\t"  /* Cb1 = (Cbt + 3*Cb) / 4 */ \
+/* either the shifts by 2 and 8 or mask off bits and shift by 6 */ \
+"psrlw     $2,    %%mm4 \n\t"  /* Cr1 = (Crt + 3*Cr) / 4 */ \
+"psllw     $8,    %%mm4 \n\t" \
+"por       %%mm4, %%mm3 \n\t"  /* Cr1 Cb1 interl  v3 u3 v2 u2 v1 u1 v0 u0 */ \
+"psrlw     $2,    %%mm1 \n\t"  /* Cb2 = (Cbb + 3*Cb) / 4 */ \
+"psrlw     $2,    %%mm2 \n\t"  /* Cr2 = (Crb + 3*Cr) / 4 */ \
+"psllw     $8,    %%mm2 \n\t" \
+"por       %%mm1, %%mm2 \n\t"  /* Cr2 Cb2 interl  v3 u3 v2 u2 v1 u1 v0 u0 */ \
+"movq      %%mm0, %%mm1 \n\t"  /*             y7 y6 y5 y4 y3 y2 y1 y0 */ \
+"punpcklbw %%mm3, %%mm1 \n\t"  /*             v1 y3 u1 y2 v0 y1 u0 y0 */ \
 : \
-: "r" (p_u2), "r" (p_v2) ); \
+: "r" (p_ut), "r" (p_vt), "r" (p_ub), "r" (p_vb) ); \
    __asm__ __volatile__( \
-"movntq    %%mm2, (%0)  \n\t"  /* Store low YUYV */ \
-"punpckhbw %%mm1, %%mm0 \n\t"  /*             v3 y7 u3 y6 v2 y5 u2 y4 */ \
-"movntq    %%mm0, 8(%0) \n\t"  /* Store high YUYV */ \
+"movntq    %%mm1, (%0)  \n\t"  /* Store low YUYV1 */ \
+"punpckhbw %%mm3, %%mm0 \n\t"  /*             v3 y7 u3 y6 v2 y5 u2 y4 */ \
+"movntq    %%mm0, 8(%0) \n\t"  /* Store high YUYV1 */ \
 "movq       (%2), %%mm0 \n\t"  /* Load 8 Y    Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
-"movq      %%mm0, %%mm2 \n\t"  /*             Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
-"punpcklbw %%mm3, %%mm2 \n\t"  /*             v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
-"movntq    %%mm2, (%1)  \n\t"  /* Store low YUYV */ \
-"punpckhbw %%mm3, %%mm0 \n\t"  /*             v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
-"movntq    %%mm0, 8(%1) \n\t"  /* Store high YUYV */ \
+"movq      %%mm0, %%mm1 \n\t"  /*             Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+"punpcklbw %%mm2, %%mm1 \n\t"  /*             v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
+"movntq    %%mm1, (%1)  \n\t"  /* Store low YUYV2 */ \
+"punpckhbw %%mm2, %%mm0 \n\t"  /*             v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
+"movntq    %%mm0, 8(%1) \n\t"  /* Store high YUYV2 */ \
 : \
 : "r" (p_line1), "r" (p_line2), "r" (p_y2) ); \
   p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \
-  p_u2 += 4; p_v2 += 4; \
+  p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \
 } while(0)
+
+#define MMXEXT_YUV420_YUYV_INTERLACED_ODD( ) \
+do { \
+   __asm__ __volatile__(".align 8 \n\t" \
+"movd       (%0), %%mm1 \n\t"  /* Load 4 Cb   00 00 00 00 u3 u2 u1 u0 */ \
+"movd       (%1), %%mm2 \n\t"  /* Load 4 Cr   00 00 00 00 v3 v2 v1 v0 */ \
+"pxor      %%mm7, %%mm7 \n\t"  /*             00 00 00 00 00 00 00 00 */ \
+"punpcklbw %%mm7, %%mm1 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm2 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"movq      %%mm1, %%mm3 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"movq      %%mm2, %%mm4 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"psllw     $2,    %%mm3 \n\t"  /* Cb * 4 */ \
+"psllw     $2,    %%mm4 \n\t"  /* Cr * 4 */ \
+"paddw     %%mm3, %%mm1 \n\t"  /* Cb * 5 */ \
+"paddw     %%mm4, %%mm2 \n\t"  /* Cr * 5 */ \
+"psrlw     $1,    %%mm3 \n\t"  /* Cb * 2 */ \
+"psrlw     $1,    %%mm4 \n\t"  /* Cr * 2 */ \
+"paddw     %%mm1, %%mm3 \n\t"  /* Cb * 7 */ \
+"paddw     %%mm2, %%mm4 \n\t"  /* Cr * 7 */ \
+: \
+: "r" (p_u), "r" (p_v) ); \
+   __asm__ __volatile__( \
+"movd       (%1), %%mm5 \n\t"  /* Load 4 Cbt  00 00 00 00 u3 u2 u1 u0 */ \
+"movd       (%2), %%mm6 \n\t"  /* Load 4 Crt  00 00 00 00 v3 v2 v1 v0 */ \
+"movq       (%0), %%mm0 \n\t"  /* Load 8 Y    y7 y6 y5 y4 y3 y2 y1 y0 */ \
+"punpcklbw %%mm7, %%mm5 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm6 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"paddw     %%mm3, %%mm5 \n\t"  /* Cb1 = Cbt + 7*Cb */ \
+"paddw     %%mm4, %%mm6 \n\t"  /* Cr1 = Crt + 7*Cr */ \
+"psrlw     $3,    %%mm5 \n\t"  /* Cb1 = (Cbt + 7*Cb) / 8 */ \
+/* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \
+"psrlw     $3,    %%mm6 \n\t"  /* Cr1 = (Crt + 7*Cr) / 8 */ \
+"psllw     $8,    %%mm6 \n\t" \
+"por       %%mm5, %%mm6 \n\t"  /* Cr1 Cb1 interl  v3 u3 v2 u2 v1 u1 v0 u0 */ \
+"movq      %%mm0, %%mm3 \n\t"  /*             y7 y6 y5 y4 y3 y2 y1 y0 */ \
+"punpcklbw %%mm6, %%mm3 \n\t"  /*             v1 y3 u1 y2 v0 y1 u0 y0 */ \
+"movntq    %%mm3, (%3)  \n\t"  /* Store low YUYV1 */ \
+"punpckhbw %%mm6, %%mm0 \n\t"  /*             v3 y7 u3 y6 v2 y5 u2 y4 */ \
+"movntq    %%mm0, 8(%3) \n\t"  /* Store high YUYV1 */ \
+: \
+: "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \
+   __asm__ __volatile__( \
+"movd       (%1), %%mm3 \n\t"  /* Load 4 Cbb  00 00 00 00 u3 u2 u1 u0 */ \
+"movd       (%2), %%mm4 \n\t"  /* Load 4 Crb  00 00 00 00 v3 v2 v1 v0 */ \
+"movq       (%0), %%mm0 \n\t"  /* Load 8 Y    Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+"punpcklbw %%mm7, %%mm3 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm4 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"movq      %%mm3, %%mm5 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"movq      %%mm4, %%mm6 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"psllw     $1,    %%mm5 \n\t"  /* Cbb * 2 */ \
+"psllw     $1,    %%mm6 \n\t"  /* Crb * 2 */ \
+"paddw     %%mm5, %%mm3 \n\t"  /* Cbb * 3 */ \
+"paddw     %%mm6, %%mm4 \n\t"  /* Crb * 3 */ \
+"paddw     %%mm3, %%mm1 \n\t"  /* Cb2 = 3*Cbb + 5*Cb */ \
+"paddw     %%mm4, %%mm2 \n\t"  /* Cr2 = 3*Crb + 5*Cr */ \
+"psrlw     $3,    %%mm1 \n\t"  /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \
+/* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \
+"psrlw     $3,    %%mm2 \n\t"  /* Cr2 = (3*Crb + 5*Cr) / 8 */ \
+"psllw     $8,    %%mm2 \n\t" \
+"por       %%mm1, %%mm2 \n\t"  /* Cr2 Cb2 interl  v3 u3 v2 u2 v1 u1 v0 u0 */ \
+"movq      %%mm0, %%mm1 \n\t"  /*             Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+"punpcklbw %%mm2, %%mm1 \n\t"  /*             v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
+"movntq    %%mm1, (%3)  \n\t"  /* Store low YUYV2 */ \
+"punpckhbw %%mm2, %%mm0 \n\t"  /*             v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
+"movntq    %%mm0, 8(%3) \n\t"  /* Store high YUYV2 */ \
+: \
+: "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \
+  p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \
+  p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \
+} while(0)
+
+#define MMXEXT_YUV420_YUYV_INTERLACED_EVEN( ) \
+/* same as above, except the assembly input arguments are switched */ \
+do { \
+   __asm__ __volatile__(".align 8 \n\t" \
+"movd       (%0), %%mm1 \n\t"  /* Load 4 Cb   00 00 00 00 u3 u2 u1 u0 */ \
+"movd       (%1), %%mm2 \n\t"  /* Load 4 Cr   00 00 00 00 v3 v2 v1 v0 */ \
+"pxor      %%mm7, %%mm7 \n\t"  /*             00 00 00 00 00 00 00 00 */ \
+"punpcklbw %%mm7, %%mm1 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm2 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"movq      %%mm1, %%mm3 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"movq      %%mm2, %%mm4 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"psllw     $2,    %%mm3 \n\t"  /* Cb * 4 */ \
+"psllw     $2,    %%mm4 \n\t"  /* Cr * 4 */ \
+"paddw     %%mm3, %%mm1 \n\t"  /* Cb * 5 */ \
+"paddw     %%mm4, %%mm2 \n\t"  /* Cr * 5 */ \
+"psrlw     $1,    %%mm3 \n\t"  /* Cb * 2 */ \
+"psrlw     $1,    %%mm4 \n\t"  /* Cr * 2 */ \
+"paddw     %%mm1, %%mm3 \n\t"  /* Cb * 7 */ \
+"paddw     %%mm2, %%mm4 \n\t"  /* Cr * 7 */ \
+: \
+: "r" (p_u), "r" (p_v) ); \
+   __asm__ __volatile__( \
+"movd       (%1), %%mm5 \n\t"  /* Load 4 Cbt  00 00 00 00 u3 u2 u1 u0 */ \
+"movd       (%2), %%mm6 \n\t"  /* Load 4 Crt  00 00 00 00 v3 v2 v1 v0 */ \
+"movq       (%0), %%mm0 \n\t"  /* Load 8 Y    y7 y6 y5 y4 y3 y2 y1 y0 */ \
+"punpcklbw %%mm7, %%mm5 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm6 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"paddw     %%mm3, %%mm5 \n\t"  /* Cb1 = Cbt + 7*Cb */ \
+"paddw     %%mm4, %%mm6 \n\t"  /* Cr1 = Crt + 7*Cr */ \
+"psrlw     $3,    %%mm5 \n\t"  /* Cb1 = (Cbt + 7*Cb) / 8 */ \
+/* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \
+"psrlw     $3,    %%mm6 \n\t"  /* Cr1 = (Crt + 7*Cr) / 8 */ \
+"psllw     $8,    %%mm6 \n\t" \
+"por       %%mm5, %%mm6 \n\t"  /* Cr1 Cb1 interl  v3 u3 v2 u2 v1 u1 v0 u0 */ \
+"movq      %%mm0, %%mm3 \n\t"  /*             y7 y6 y5 y4 y3 y2 y1 y0 */ \
+"punpcklbw %%mm6, %%mm3 \n\t"  /*             v1 y3 u1 y2 v0 y1 u0 y0 */ \
+"movntq    %%mm3, (%3)  \n\t"  /* Store low YUYV1 */ \
+"punpckhbw %%mm6, %%mm0 \n\t"  /*             v3 y7 u3 y6 v2 y5 u2 y4 */ \
+"movntq    %%mm0, 8(%3) \n\t"  /* Store high YUYV1 */ \
+: \
+: "r" (p_y2), "r" (p_ub), "r" (p_vb), "r" (p_line2) ); \
+   __asm__ __volatile__( \
+"movd       (%1), %%mm3 \n\t"  /* Load 4 Cbb  00 00 00 00 u3 u2 u1 u0 */ \
+"movd       (%2), %%mm4 \n\t"  /* Load 4 Crb  00 00 00 00 v3 v2 v1 v0 */ \
+"movq       (%0), %%mm0 \n\t"  /* Load 8 Y    Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+"punpcklbw %%mm7, %%mm3 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"punpcklbw %%mm7, %%mm4 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"movq      %%mm3, %%mm5 \n\t"  /*             00 u3 00 u2 00 u1 00 u0 */ \
+"movq      %%mm4, %%mm6 \n\t"  /*             00 v3 00 v2 00 v1 00 v0 */ \
+"psllw     $1,    %%mm5 \n\t"  /* Cbb * 2 */ \
+"psllw     $1,    %%mm6 \n\t"  /* Crb * 2 */ \
+"paddw     %%mm5, %%mm3 \n\t"  /* Cbb * 3 */ \
+"paddw     %%mm6, %%mm4 \n\t"  /* Crb * 3 */ \
+"paddw     %%mm3, %%mm1 \n\t"  /* Cb2 = 3*Cbb + 5*Cb */ \
+"paddw     %%mm4, %%mm2 \n\t"  /* Cr2 = 3*Crb + 5*Cr */ \
+"psrlw     $3,    %%mm1 \n\t"  /* Cb2 = (3*Cbb + 5*Cb) / 8 */ \
+/* either the shifts by 3 and 8 or mask off bits and shift by 5 */ \
+"psrlw     $3,    %%mm2 \n\t"  /* Cr2 = (3*Crb + 5*Cr) / 8 */ \
+"psllw     $8,    %%mm2 \n\t" \
+"por       %%mm1, %%mm2 \n\t"  /* Cr2 Cb2 interl  v3 u3 v2 u2 v1 u1 v0 u0 */ \
+"movq      %%mm0, %%mm1 \n\t"  /*             Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+"punpcklbw %%mm2, %%mm1 \n\t"  /*             v1 Y3 u1 Y2 v0 Y1 u0 Y0 */ \
+"movntq    %%mm1, (%3)  \n\t"  /* Store low YUYV2 */ \
+"punpckhbw %%mm2, %%mm0 \n\t"  /*             v3 Y7 u3 Y6 v2 Y5 u2 Y4 */ \
+"movntq    %%mm0, 8(%3) \n\t"  /* Store high YUYV2 */ \
+: \
+: "r" (p_y1), "r" (p_ut), "r" (p_vt), "r" (p_line1) ); \
+  p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \
+  p_ub += 4; p_vb += 4; p_ut += 4; p_vt += 4; \
+} while(0)
 
 #endif
 
@@ -836,10 +1026,12 @@ static void yv12_to_yuy2_mmxext
   const uint8_t *p_y1, *p_y2 = y_src;
   const uint8_t *p_u = u_src;
   const uint8_t *p_v = v_src;
-  const uint8_t *p_u2 = u_src + u_src_pitch;
-  const uint8_t *p_v2 = v_src + v_src_pitch;
+  const uint8_t *p_ub, *p_vb;
+  const uint8_t *p_ut = u_src;
+  const uint8_t *p_vt = v_src;
 
   int i_x, i_y;
+  int utmp, vtmp;
 
   const int i_source_margin = y_src_pitch - width;
   const int i_source_u_margin = u_src_pitch - width/2;
@@ -856,32 +1048,33 @@ static void yv12_to_yuy2_mmxext
      p_y1 = p_y2;
      p_y2 += y_src_pitch;
 
+      if( i_y > 1 ) {
+        p_ub = p_u + u_src_pitch;
+        p_vb = p_v + v_src_pitch;
+      } else {
+        p_ub = p_u;
+        p_vb = p_v;
+      }
+
      for( i_x = width / 8 ; i_x-- ; )
      {
-        MMXEXT_YUV420_YUYV( );
+        MMXEXT_YUV420_YUYV_PROGRESSIVE( );
      }
      for( i_x = (width % 8) / 2 ; i_x-- ; )
      {
-        C_YUV420_YUYV( );
+        C_YUV420_YUYV_PROGRESSIVE( );
      }
 
      p_y2 += i_source_margin;
      p_u += i_source_u_margin;
      p_v += i_source_v_margin;
-      if( i_y > 1 ) {
-        p_u2 += i_source_u_margin;
-        p_v2 += i_source_v_margin;
-      } else {
-        p_u2 = p_u;
-        p_v2 = p_v;
-      }
+      p_ut = p_u - u_src_pitch;
+      p_vt = p_v - v_src_pitch;
      p_line2 += i_dest_margin;
    }
  }
  else
  {
-    p_u2 = u_src + 2*u_src_pitch;
-    p_v2 = v_src + 2*v_src_pitch;
    for( i_y = height / 4 ; i_y-- ; )
    {
      p_line1 = p_line2;
@@ -890,25 +1083,28 @@ static void yv12_to_yuy2_mmxext
      p_y1 = p_y2;
      p_y2 += 2 * y_src_pitch;
 
+      if( i_y > 1 ) {
+        p_ub = p_u + 2 * u_src_pitch;
+        p_vb = p_v + 2 * v_src_pitch;
+      } else {
+        p_ub = p_u;
+        p_vb = p_v;
+      }
+
      for( i_x = width / 8 ; i_x-- ; )
      {
-        MMXEXT_YUV420_YUYV( );
+        MMXEXT_YUV420_YUYV_INTERLACED_ODD( );
      }
      for( i_x = (width % 8) / 2 ; i_x-- ; )
      {
-        C_YUV420_YUYV( );
+        C_YUV420_YUYV_INTERLACED_ODD( );
      }
 
      p_y2 += i_source_margin + y_src_pitch;
      p_u += i_source_u_margin + u_src_pitch;
      p_v += i_source_v_margin + v_src_pitch;
-      if( i_y > 1 ) {
-        p_u2 += i_source_u_margin + u_src_pitch;
-        p_v2 += i_source_v_margin + v_src_pitch;
-      } else {
-        p_u2 = p_u;
-        p_v2 = p_v;
-      }
+      p_ut = p_u - 2 * u_src_pitch;
+      p_vt = p_v - 2 * v_src_pitch;
      p_line2 += i_dest_margin + yuy2_pitch;
    }
 
@@ -916,9 +1112,9 @@ static void yv12_to_yuy2_mmxext
    p_y2 = y_src + y_src_pitch;
    p_u = u_src + u_src_pitch;
    p_v = v_src + v_src_pitch;
-    p_u2 = u_src + 3*u_src_pitch;
-    p_v2 = v_src + 3*v_src_pitch;
-
+    p_ut = p_u;
+    p_vt = p_v;
+
    for( i_y = height / 4 ; i_y-- ; )
    {
      p_line1 = p_line2;
@@ -927,25 +1123,28 @@ static void yv12_to_yuy2_mmxext
      p_y1 = p_y2;
      p_y2 += 2 * y_src_pitch;
 
+      if( i_y > 1 ) {
+        p_ub = p_u + 2 * u_src_pitch;
+        p_vb = p_v + 2 * v_src_pitch;
+      } else {
+        p_ub = p_u;
+        p_vb = p_v;
+      }
+
      for( i_x = width / 8 ; i_x-- ; )
      {
-        MMXEXT_YUV420_YUYV( );
+        MMXEXT_YUV420_YUYV_INTERLACED_EVEN( );
      }
      for( i_x = (width % 8) / 2 ; i_x-- ; )
      {
-        C_YUV420_YUYV( );
+        C_YUV420_YUYV_INTERLACED_EVEN( );
      }
 
      p_y2 += i_source_margin + y_src_pitch;
      p_u += i_source_u_margin + u_src_pitch;
      p_v += i_source_v_margin + v_src_pitch;
-      if( i_y > 1 ) {
-        p_u2 += i_source_u_margin + u_src_pitch;
-        p_v2 += i_source_v_margin + v_src_pitch;
-      } else {
-        p_u2 = p_u;
-        p_v2 = p_v;
-      }
+      p_ut = p_u - 2 * u_src_pitch;
+      p_vt = p_v - 2 * v_src_pitch;
      p_line2 += i_dest_margin + yuy2_pitch;
    }
 
@@ -1144,7 +1343,7 @@ void init_yuv_conversion(void) {
   else
     yv12_to_yuy2 = yv12_to_yuy2_c;
 
-  /* determine best YV12 -> YUY2 converter to use */
+  /* determine best YUY2 -> YV12 converter to use */
   if (xine_mm_accel() & MM_ACCEL_X86_MMXEXT)
     yuy2_to_yv12 = yuy2_to_yv12_mmxext;
  else
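
Note on the ff_video_decoder.c half of the merge: the decoder now routes each PES timestamp through libavcodec's reordered_opaque field so that it comes back attached to the correct (reordered) output frame, and the high bits of the value are reserved as a tag for recognising timestamps that predate a discontinuity. The following is a minimal standalone sketch of that tag/untag round trip, mirroring the arithmetic of ff_tag_pts() and ff_untag_pts() from the patch; the tag_state_t struct, the mask/tag constants and the main() harness are illustrative inventions, not xine's actual types.

#include <stdint.h>
#include <stdio.h>

/* Illustrative subset of the decoder state: only the tagging fields. */
typedef struct {
  uint64_t pts_tag_mask;  /* high bits reserved for the tag */
  uint64_t pts_tag;       /* tag value for the current discontinuity epoch */
} tag_state_t;

/* Mirrors ff_tag_pts(): stamp the current tag into the high bits. */
static uint64_t tag_pts(const tag_state_t *s, uint64_t pts) {
  return pts | s->pts_tag;
}

/* Mirrors ff_untag_pts(): strip the tag, or zero out stale values. */
static uint64_t untag_pts(const tag_state_t *s, uint64_t pts) {
  if (s->pts_tag_mask == 0)
    return pts;                                   /* tagging inactive */
  if (s->pts_tag != 0 && (pts & s->pts_tag_mask) != s->pts_tag)
    return 0;                                     /* outdated pts */
  return pts & ~s->pts_tag_mask;
}

int main(void) {
  /* Hypothetical values: reserve the top 16 bits, current tag = 1. */
  tag_state_t s = { .pts_tag_mask = 0xFFFFull << 48,
                    .pts_tag      = 1ull << 48 };
  uint64_t pts = 90000;                 /* one second at 90 kHz */
  uint64_t tagged = tag_pts(&s, pts);   /* survives frame reordering */
  printf("tagged:   %#llx\n", (unsigned long long)tagged);
  printf("untagged: %llu\n", (unsigned long long)untag_pts(&s, tagged));
  /* A value carrying an older tag (here: tag 0) is rejected as stale. */
  printf("stale:    %llu\n", (unsigned long long)untag_pts(&s, pts));
  return 0;
}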
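Note on the color.c half: the old C_YUV420_YUYV macro emitted the chroma row unchanged for the first output line and a simple mean ((*p_u + *p_u2) >> 1) for the second, which is what the updated source comment calls a chroma shift. The new macros instead weight the nearest chroma row against its vertical neighbours: 3:1 in quarters for progressive content (C_YUV420_YUYV_PROGRESSIVE), and 1:7 / 5:3 respectively 3:5 / 7:1 in eighths for the odd and even fields of interlaced content. Below is a small sketch of the progressive case for a single chroma column, mirroring the macro's arithmetic without the pointer walking; the sample values are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* One step of the progressive 4:2:0 -> 4:2:2 chroma upsampling from
 * C_YUV420_YUYV_PROGRESSIVE: the chroma row c feeds the two output
 * lines around it, mixed 3:1 with the rows above (top) and below
 * (bottom). */
static void upsample_pair(uint8_t top, uint8_t c, uint8_t bottom,
                          uint8_t *line1, uint8_t *line2) {
  int tmp = 3 * c;                          /* nearest row gets weight 3 */
  *line1 = (uint8_t)((top + tmp) >> 2);     /* (1*top + 3*c) / 4 */
  *line2 = (uint8_t)((tmp + bottom) >> 2);  /* (3*c + 1*bottom) / 4 */
}

int main(void) {
  uint8_t l1, l2;
  /* Chroma column 100, 120, 160: the old code produced 120 and
   * (120 + 160) / 2 = 140, i.e. both outputs biased downward. */
  upsample_pair(100, 120, 160, &l1, &l2);
  printf("line1 = %u, line2 = %u\n", l1, l2);  /* 115 and 130 */
  return 0;
}

The MMXEXT macros in the diff compute exactly these sums four chroma samples at a time (widen to 16 bit with punpcklbw, build 3*C with a shift and an add, add the neighbour row, then psrlw to divide).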