diff options
27 files changed, 2183 insertions, 3457 deletions
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am index c2ffc84bc..8e2a6a4df 100644 --- a/src/libffmpeg/libavcodec/Makefile.am +++ b/src/libffmpeg/libavcodec/Makefile.am @@ -2,6 +2,8 @@ ## Process this file with automake to produce Makefile.in ## +SUBDIRS = armv4l i386 mlib + #CFLAGS = -D_FILE_OFFSET_BITS=64 @GLOBAL_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H CFLAGS = @GLOBAL_CFLAGS@ @LIBFFMPEG_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H @@ -11,28 +13,22 @@ LIBTOOL = $(SHELL) $(top_builddir)/libtool-nofpic noinst_LTLIBRARIES = libavcodec.la -if HAVE_FFMMX -mmx_modules = mpegvideo_mmx.c sad_mmx.s dsputil_mmx.c idct_mmx.c -#mmx_modules = mpegvideo_mmx.c sad_mmx.s -endif - -if HAVE_MLIB -mlib_modules = dsputil_mlib.c idct_mlib.c -endif - -EXTRA_DIST = mpegvideo_mmx.c sad_mmx.s dsputil_mmx.c idct_mmx.c \ - dsputil_mlib.c idct_mlib.c - -libavcodec_la_SOURCES = dsputil.c fdctref.c jfdctfst.c mpeg12.c \ - utils.c rv10.c h263.c jrevdct.c \ - common.c h263dec.c msmpeg4.c \ - mpegvideo.c mjpeg.c motion_est.c \ - $(mmx_modules) $(mlib_modules) +libavcodec_la_SOURCES = common.c utils.c mpegvideo.c h263.c jrevdct.c jfdctfst.c \ + mjpeg.c dsputil.c \ + motion_est.c imgconvert.c msmpeg4.c \ + mpeg12.c h263dec.c rv10.c simple_idct.c +#imgresample.c + +libavcodec_la_LDFLAGS = \ + $(top_builddir)/src/libffmpeg/libavcodec/armv4l/libavcodec_armv4l.la \ + $(top_builddir)/src/libffmpeg/libavcodec/i386/libavcodec_mmx.la \ + $(top_builddir)/src/libffmpeg/libavcodec/mlib/libavcodec_mlib.la \ + -avoid-version -module -noinst_HEADERS = avcodec.h dsputil.h mpegvideo.h dsputil_mmx_avg.h\ +noinst_HEADERS = avcodec.h dsputil.h mpegvideo.h \ common.h h263data.h mpeg4data.h msmpeg4data.h \ - mpeg12data.h + mpeg12data.h simple_idct.h .s.lo: $(ASCOMPILE) -o $@ `test -f $< || echo '$(srcdir)/'`$< diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h index 5155c8fd2..864ebb3ba 100644 --- a/src/libffmpeg/libavcodec/avcodec.h +++ b/src/libffmpeg/libavcodec/avcodec.h @@ -1,3 +1,6 @@ +#ifndef AVCODEC_H +#define AVCODEC_H + #include "common.h" enum CodecID { @@ -9,11 +12,20 @@ enum CodecID { CODEC_ID_AC3, CODEC_ID_MJPEG, CODEC_ID_MPEG4, - CODEC_ID_PCM, CODEC_ID_RAWVIDEO, CODEC_ID_MSMPEG4, CODEC_ID_H263P, CODEC_ID_H263I, + + /* various pcm "codecs" */ + CODEC_ID_PCM_S16LE, + CODEC_ID_PCM_S16BE, + CODEC_ID_PCM_U16LE, + CODEC_ID_PCM_U16BE, + CODEC_ID_PCM_S8, + CODEC_ID_PCM_U8, + CODEC_ID_PCM_MULAW, + CODEC_ID_PCM_ALAW, }; enum CodecType { @@ -30,6 +42,11 @@ enum PixelFormat { PIX_FMT_YUV444P, }; +/* currently unused, may be used if 24/32 bits samples ever supported */ +enum SampleFormat { + SAMPLE_FMT_S16 = 0, /* signed 16 bits */ +}; + /* in bytes */ #define AVCODEC_MAX_AUDIO_FRAME_SIZE 18432 @@ -74,6 +91,7 @@ typedef struct AVCodecContext { /* audio only */ int sample_rate; /* samples per sec */ int channels; + int sample_fmt; /* sample format, currenly unused */ /* the following data should not be initialized */ int frame_size; /* in samples, initialized when calling 'init' */ @@ -85,6 +103,19 @@ typedef struct AVCodecContext { struct AVCodec *codec; void *priv_data; + /* The following data is for RTP friendly coding */ + /* By now only H.263/H.263+ coder honours this */ + int rtp_mode; /* 1 for activate RTP friendly-mode */ + /* highers numbers represent more error-prone */ + /* enviroments, by now just "1" exist */ + + int rtp_payload_size; /* The size of the RTP payload, the coder will */ + /* do it's best to deliver a chunk with size */ + /* below rtp_payload_size, the chunk will start */ + /* with a start code on some codecs like H.263 */ + /* This doesn't take account of any particular */ + /* headers inside the transmited RTP payload */ + /* the following fields are ignored */ void *opaque; /* can be used to carry app specific stuff */ char codec_name[32]; @@ -130,13 +161,30 @@ extern AVCodec mpeg_decoder; extern AVCodec h263i_decoder; extern AVCodec rv10_decoder; extern AVCodec mjpeg_decoder; +#ifdef FF_AUDIO_CODECS +extern AVCodec mp3_decoder; + +/* pcm codecs */ +#define PCM_CODEC(id, name) \ +extern AVCodec name ## _decoder; \ +extern AVCodec name ## _encoder; -/* dummy raw codecs */ -extern AVCodec pcm_codec; +PCM_CODEC(CODEC_ID_PCM_S16LE, pcm_s16le); +PCM_CODEC(CODEC_ID_PCM_S16BE, pcm_s16be); +PCM_CODEC(CODEC_ID_PCM_U16LE, pcm_u16le); +PCM_CODEC(CODEC_ID_PCM_U16BE, pcm_u16be); +PCM_CODEC(CODEC_ID_PCM_S8, pcm_s8); +PCM_CODEC(CODEC_ID_PCM_U8, pcm_u8); +PCM_CODEC(CODEC_ID_PCM_ALAW, pcm_alaw); +PCM_CODEC(CODEC_ID_PCM_MULAW, pcm_mulaw); + +#undef PCM_CODEC +#endif + +/* dummy raw video codec */ extern AVCodec rawvideo_codec; /* the following codecs use external GPL libs */ -extern AVCodec mp3_decoder; extern AVCodec ac3_decoder; /* resample.c */ @@ -184,6 +232,7 @@ void avcodec_init(void); void register_avcodec(AVCodec *format); AVCodec *avcodec_find_encoder(enum CodecID id); +AVCodec *avcodec_find_encoder_by_name(const char *name); AVCodec *avcodec_find_decoder(enum CodecID id); AVCodec *avcodec_find_decoder_by_name(const char *name); void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode); @@ -203,3 +252,13 @@ int avcodec_encode_video(AVCodecContext *avctx, UINT8 *buf, int buf_size, int avcodec_close(AVCodecContext *avctx); void avcodec_register_all(void); + +#ifdef FF_POSTPROCESS +#ifndef MBC +#define MBC 48 +#define MBR 36 +#endif +extern int quant_store[MBR+1][MBC+1]; // [Review] +#endif + +#endif /* AVCODEC_H */ diff --git a/src/libffmpeg/libavcodec/common.c b/src/libffmpeg/libavcodec/common.c index b3ce07c05..8f0fdc1bb 100644 --- a/src/libffmpeg/libavcodec/common.c +++ b/src/libffmpeg/libavcodec/common.c @@ -250,6 +250,52 @@ void align_get_bits(GetBitContext *s) get_bits(s, n); } } +/* This function is identical to get_bits_long(), the */ +/* only diference is that it doesn't touch the buffer */ +/* it is usefull to see the buffer. */ + +unsigned int show_bits_long(GetBitContext *s, int n) +{ + unsigned int val; + int bit_cnt; + unsigned int bit_buf; + UINT8 *buf_ptr; + + bit_buf = s->bit_buf; + bit_cnt = s->bit_cnt - n; + + val = bit_buf >> (32 - n); + buf_ptr = s->buf_ptr; + buf_ptr += 4; + + /* handle common case: we can read everything */ + if (buf_ptr <= s->buf_end) { +#ifdef ARCH_X86 + bit_buf = bswap_32(*((unsigned long*)(&buf_ptr[-4]))); +#else + bit_buf = (buf_ptr[-4] << 24) | + (buf_ptr[-3] << 16) | + (buf_ptr[-2] << 8) | + (buf_ptr[-1]); +#endif + } else { + buf_ptr -= 4; + bit_buf = 0; + if (buf_ptr < s->buf_end) + bit_buf |= *buf_ptr++ << 24; + if (buf_ptr < s->buf_end) + bit_buf |= *buf_ptr++ << 16; + if (buf_ptr < s->buf_end) + bit_buf |= *buf_ptr++ << 8; + if (buf_ptr < s->buf_end) + bit_buf |= *buf_ptr++; + } + val |= bit_buf >> (32 + bit_cnt); + bit_buf <<= - bit_cnt; + bit_cnt += 32; + + return val; +} /* VLC decoding */ diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h index 8ea58b2e7..be9678cf4 100644 --- a/src/libffmpeg/libavcodec/common.h +++ b/src/libffmpeg/libavcodec/common.h @@ -1,10 +1,13 @@ #ifndef COMMON_H #define COMMON_H -#define FFMPEG_VERSION_INT 0x000405 -#define FFMPEG_VERSION "0.4.5" +#define FFMPEG_VERSION_INT 0x000406 +#define FFMPEG_VERSION "0.4.6" +/* CVS version as 26-12-2001 */ -#ifdef WIN32 +#undef DEBUG + +#if defined(WIN32) && !defined(__MINGW32__) #define CONFIG_WIN32 #endif @@ -43,6 +46,7 @@ typedef INT16 int16_t; typedef UINT32 uint32_t; typedef INT32 int32_t; +#ifndef __MINGW32__ #define INT64_C(c) (c ## i64) #define UINT64_C(c) (c ## i64) @@ -56,6 +60,11 @@ typedef INT32 int32_t; #pragma warning( disable : 4244 ) #pragma warning( disable : 4305 ) +#else +#define INT64_C(c) (c ## LL) +#define UINT64_C(c) (c ## ULL) +#endif /* __MINGW32__ */ + #define M_PI 3.14159265358979323846 #define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ @@ -71,10 +80,12 @@ typedef INT32 int32_t; #define snprintf _snprintf +#ifndef __MINGW32__ +/* no config.h with VC */ #define CONFIG_ENCODERS 1 #define CONFIG_DECODERS 1 #define CONFIG_AC3 1 -#define CONFIG_MPGLIB 1 +#endif #else @@ -112,15 +123,35 @@ typedef signed long long INT64; #include "fastmemcpy.h" #endif +#endif /* HAVE_AV_CONFIG_H */ + +#endif /* !CONFIG_WIN32 */ + +/* debug stuff */ +#ifdef HAVE_AV_CONFIG_H + #ifndef DEBUG #define NDEBUG #endif #include <assert.h> -#endif /* HAVE_AV_CONFIG_H */ +/* dprintf macros */ +#if defined(CONFIG_WIN32) && !defined(__MINGW32__) + +inline void dprintf(const char* fmt,...) {} + +#else + +#ifdef DEBUG +#define dprintf(fmt,args...) printf(fmt, ## args) +#else +#define dprintf(fmt,args...) +#endif #endif /* !CONFIG_WIN32 */ +#endif /* HAVE_AV_CONFIG_H */ + /* bit output */ struct PutBitContext; @@ -141,7 +172,7 @@ void init_put_bits(PutBitContext *s, void *opaque, void (*write_data)(void *, UINT8 *, int)); void put_bits(PutBitContext *s, int n, unsigned int value); -INT64 get_bit_count(PutBitContext *s); +INT64 get_bit_count(PutBitContext *s); /* XXX: change function name */ void align_put_bits(PutBitContext *s); void flush_put_bits(PutBitContext *s); @@ -168,6 +199,7 @@ void init_get_bits(GetBitContext *s, UINT8 *buffer, int buffer_size); unsigned int get_bits_long(GetBitContext *s, int n); +unsigned int show_bits_long(GetBitContext *s, int n); static inline unsigned int get_bits(GetBitContext *s, int n){ if(s->bit_cnt>=n){ @@ -197,6 +229,19 @@ static inline unsigned int get_bits1(GetBitContext *s){ return get_bits_long(s,1); } +/* This function is identical to get_bits(), the only */ +/* diference is that it doesn't touch the buffer */ +/* it is usefull to see the buffer. */ +static inline unsigned int show_bits(GetBitContext *s, int n) +{ + if(s->bit_cnt>=n) { + /* most common case here */ + unsigned int val = s->bit_buf >> (32 - n); + return val; + } + return show_bits_long(s,n); +} + static inline void skip_bits(GetBitContext *s, int n){ if(s->bit_cnt>=n){ /* most common case here */ @@ -223,6 +268,10 @@ static inline void skip_bits1(GetBitContext *s){ } } +static inline int get_bits_count(GetBitContext *s) +{ + return (s->buf_ptr - s->buf) * 8 - s->bit_cnt; +} void align_get_bits(GetBitContext *s); int init_vlc(VLC *vlc, int nb_bits, int nb_codes, diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c index e68833b7c..f699b2ef6 100644 --- a/src/libffmpeg/libavcodec/dsputil.c +++ b/src/libffmpeg/libavcodec/dsputil.c @@ -20,7 +20,7 @@ #include <stdio.h> #include "avcodec.h" #include "dsputil.h" -#include "xineutils.h" +#include "simple_idct.h" void (*ff_idct)(DCTELEM *block); void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size); @@ -389,6 +389,27 @@ int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) /* permute block according so that it corresponds to the MMX idct order */ +#ifdef SIMPLE_IDCT +void block_permute(INT16 *block) +{ + int i; + INT16 temp[64]; + +// for(i=0; i<64; i++) temp[i] = block[ block_permute_op(i) ]; + for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i]; + + for(i=0; i<64; i++) block[i] = temp[i]; +/* + for(i=0; i<64; i++) + { + if((i&7)==0) printf("\n"); + printf("%2d ", block[i]); + } +*/ +} + +#else + void block_permute(INT16 *block) { int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; @@ -410,16 +431,12 @@ void block_permute(INT16 *block) block += 8; } } +#endif void dsputil_init(void) { int i, j; -#ifdef ARCH_X86 - uint32_t mm = xine_mm_accel(); -#endif - int use_permuted_mmx_idct; - int accel_dsputil; - + int use_permuted_idct; for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i; for(i=0;i<MAX_NEG_CROP;i++) { @@ -431,7 +448,11 @@ void dsputil_init(void) squareTbl[i] = (i - 256) * (i - 256); } +#ifdef SIMPLE_IDCT + ff_idct = simple_idct; +#else ff_idct = j_rev_dct; +#endif get_pixels = get_pixels_c; put_pixels_clamped = put_pixels_clamped_c; add_pixels_clamped = add_pixels_clamped_c; @@ -442,46 +463,34 @@ void dsputil_init(void) pix_abs16x16_xy2 = pix_abs16x16_xy2_c; av_fdct = jpeg_fdct_ifast; - use_permuted_mmx_idct = 1; - accel_dsputil = 0; + use_permuted_idct = 1; -#ifdef ARCH_X86 - if (!accel_dsputil && (mm & MM_ACCEL_X86_MMX)) { - dsputil_init_mmx(); - accel_dsputil = 1; - /* printf("AVCODEC: Using mmx idct\n"); */ - } +#ifdef HAVE_MMX + dsputil_init_mmx(); #endif #ifdef ARCH_ARMV4L - if (!accel_dsputil) { - dsputil_init_armv4l(); - accel_dsputil = 1; - /* printf("AVCODEC: Using armv4l idct\n"); */ - } + dsputil_init_armv4l(); #endif #ifdef HAVE_MLIB - if (!accel_dsputil) { - dsputil_init_mlib(); - accel_dsputil = 1; - use_permuted_mmx_idct = 0; - /* printf("AVCODEC: Using mediaLib idct\n"); */ - } + dsputil_init_mlib(); + use_permuted_idct = 0; #endif - if (!accel_dsputil) { - /* printf("AVCODEC: Using C idct\n"); */ - } - if (use_permuted_mmx_idct) { - /* permute for IDCT */ - for(i=0;i<64;i++) { - j = zigzag_direct[i]; - zigzag_direct[i] = block_permute_op(j); - j = ff_alternate_horizontal_scan[i]; - ff_alternate_horizontal_scan[i] = block_permute_op(j); - j = ff_alternate_vertical_scan[i]; - ff_alternate_vertical_scan[i] = block_permute_op(j); - } - block_permute(default_intra_matrix); - block_permute(default_non_intra_matrix); +#ifdef SIMPLE_IDCT + if(ff_idct == simple_idct) use_permuted_idct=0; +#endif + + if (use_permuted_idct) { + /* permute for IDCT */ + for(i=0;i<64;i++) { + j = zigzag_direct[i]; + zigzag_direct[i] = block_permute_op(j); + j = ff_alternate_horizontal_scan[i]; + ff_alternate_horizontal_scan[i] = block_permute_op(j); + j = ff_alternate_vertical_scan[i]; + ff_alternate_vertical_scan[i] = block_permute_op(j); + } + block_permute(default_intra_matrix); + block_permute(default_non_intra_matrix); } } diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h index e401065a0..182b1e706 100644 --- a/src/libffmpeg/libavcodec/dsputil.h +++ b/src/libffmpeg/libavcodec/dsputil.h @@ -67,19 +67,53 @@ int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h); int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h); int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h); +#if defined (SIMPLE_IDCT) && defined (ARCH_X86) +static inline int block_permute_op(int j) +{ +static const int table[64]={ + 0x00, 0x08, 0x01, 0x09, 0x04, 0x0C, 0x05, 0x0D, + 0x10, 0x18, 0x11, 0x19, 0x14, 0x1C, 0x15, 0x1D, + 0x02, 0x0A, 0x03, 0x0B, 0x06, 0x0E, 0x07, 0x0F, + 0x12, 0x1A, 0x13, 0x1B, 0x16, 0x1E, 0x17, 0x1F, + 0x20, 0x28, 0x21, 0x29, 0x24, 0x2C, 0x25, 0x2D, + 0x30, 0x38, 0x31, 0x39, 0x34, 0x3C, 0x35, 0x3D, + 0x22, 0x2A, 0x23, 0x2B, 0x26, 0x2E, 0x27, 0x2F, + 0x32, 0x3A, 0x33, 0x3B, 0x36, 0x3E, 0x37, 0x3F, +}; + + return table[j]; +} +#elif defined (SIMPLE_IDCT) +static inline int block_permute_op(int j) +{ + return j; +} +#else static inline int block_permute_op(int j) { return (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); } +#endif void block_permute(INT16 *block); #if defined(ARCH_X86) +#define HAVE_MMX 1 + #include "xineutils.h" extern int mm_flags; +#define mm_support() xine_mm_accel() + +#if 0 +static inline void emms(void) +{ + __asm __volatile ("emms;":::"memory"); +} +#endif + #define emms_c() \ {\ if (mm_flags & MM_MMX)\ @@ -92,36 +126,29 @@ void dsputil_init_mmx(void); #elif defined(ARCH_ARMV4L) +#define emms_c() + /* This is to use 4 bytes read to the IDCT pointers for some 'zero' line ptimizations */ #define __align8 __attribute__ ((aligned (4))) void dsputil_init_armv4l(void); -#endif - - - -#if defined(HAVE_MLIB) +#elif defined(HAVE_MLIB) + +#define emms_c() /* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */ #define __align8 __attribute__ ((aligned (8))) -void dsputil_init_mlib(void); +void dsputil_init_mlib(void); -#endif /* HAVE_MLIB */ +#else +#define emms_c() -/* - * provide empty defaults, if the target specific accelerated dsputils did - * not define these: - */ -#ifndef __align8 -#define __align8 -#endif +#define __align8 -#ifndef emms_c -#define emms_c() #endif #endif diff --git a/src/libffmpeg/libavcodec/dsputil_mlib.c b/src/libffmpeg/libavcodec/dsputil_mlib.c deleted file mode 100644 index 4be97d0c7..000000000 --- a/src/libffmpeg/libavcodec/dsputil_mlib.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Sun mediaLib optimized DSP utils - * Copyright (c) 2001 Juergen Keil. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include "dsputil.h" - -#include <mlib_types.h> -#include <mlib_status.h> -#include <mlib_sys.h> -#include <mlib_video.h> - - -static void put_pixels_mlib (uint8_t * dest, const uint8_t * ref, - int stride, int height) -{ - assert(height == 16 || height == 8); - if (height == 16) - mlib_VideoCopyRef_U8_U8_8x16(dest, (uint8_t *)ref, stride); - else - mlib_VideoCopyRef_U8_U8_8x8 (dest, (uint8_t *)ref, stride); -} - -static void put_pixels_x2_mlib (uint8_t * dest, const uint8_t * ref, - int stride, int height) -{ - assert(height == 16 || height == 8); - if (height == 16) - mlib_VideoInterpX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride); - else - mlib_VideoInterpX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride); -} - -static void put_pixels_y2_mlib (uint8_t * dest, const uint8_t * ref, - int stride, int height) -{ - assert(height == 16 || height == 8); - if (height == 16) - mlib_VideoInterpY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride); - else - mlib_VideoInterpY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride); -} - -static void put_pixels_xy2_mlib(uint8_t * dest, const uint8_t * ref, - int stride, int height) -{ - assert(height == 16 || height == 8); - if (height == 16) - mlib_VideoInterpXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride); - else - mlib_VideoInterpXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride); -} - -static void avg_pixels_mlib (uint8_t * dest, const uint8_t * ref, - int stride, int height) -{ - assert(height == 16 || height == 8); - if (height == 16) - mlib_VideoCopyRefAve_U8_U8_8x16(dest, (uint8_t *)ref, stride); - else - mlib_VideoCopyRefAve_U8_U8_8x8 (dest, (uint8_t *)ref, stride); -} - -static void avg_pixels_x2_mlib (uint8_t * dest, const uint8_t * ref, - int stride, int height) -{ - assert(height == 16 || height == 8); - if (height == 16) - mlib_VideoInterpAveX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride); - else - mlib_VideoInterpAveX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride); -} - -static void avg_pixels_y2_mlib (uint8_t * dest, const uint8_t * ref, - int stride, int height) -{ - assert(height == 16 || height == 8); - if (height == 16) - mlib_VideoInterpAveY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride); - else - mlib_VideoInterpAveY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride); -} - -static void avg_pixels_xy2_mlib (uint8_t * dest, const uint8_t * ref, - int stride, int height) -{ - assert(height == 16 || height == 8); - if (height == 16) - mlib_VideoInterpAveXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride); - else - mlib_VideoInterpAveXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride); -} - - -static void add_pixels_clamped_mlib(const DCTELEM *block, UINT8 *pixels, int line_size) -{ - mlib_VideoAddBlock_U8_S16(pixels, (mlib_s16 *)block, line_size); -} - - -extern void ff_fdct_mlib(DCTELEM *data); -extern void ff_idct_mlib(DCTELEM *data); - -void dsputil_init_mlib(void) -{ - av_fdct = ff_fdct_mlib; - ff_idct = ff_idct_mlib; - - put_pixels_tab[0] = put_pixels_mlib; - put_pixels_tab[1] = put_pixels_x2_mlib; - put_pixels_tab[2] = put_pixels_y2_mlib; - put_pixels_tab[3] = put_pixels_xy2_mlib; - - avg_pixels_tab[0] = avg_pixels_mlib; - avg_pixels_tab[1] = avg_pixels_x2_mlib; - avg_pixels_tab[2] = avg_pixels_y2_mlib; - avg_pixels_tab[3] = avg_pixels_xy2_mlib; - - put_no_rnd_pixels_tab[0] = put_pixels_mlib; - - add_pixels_clamped = add_pixels_clamped_mlib; -} diff --git a/src/libffmpeg/libavcodec/dsputil_mmx.c b/src/libffmpeg/libavcodec/dsputil_mmx.c deleted file mode 100644 index b806c34e5..000000000 --- a/src/libffmpeg/libavcodec/dsputil_mmx.c +++ /dev/null @@ -1,1057 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Gerard Lantau. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - */ - -#include "dsputil.h" -#include "xineutils.h" - -int mm_flags; /* multimedia extension flags */ - -int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); -int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h); -int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); -int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); -int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); - -/* external functions, from idct_mmx.c */ -void ff_mmx_idct(DCTELEM *block); -void ff_mmxext_idct(DCTELEM *block); - -/* pixel operations */ -static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; -static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; -//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; -//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; - -/***********************************/ -/* 3Dnow specific */ - -#define DEF(x) x ## _3dnow -/* for Athlons PAVGUSB is prefered */ -#define PAVGB "pavgusb" - -#include "dsputil_mmx_avg.h" - -#undef DEF -#undef PAVGB - -/***********************************/ -/* MMX2 specific */ - -#define DEF(x) x ## _sse - -/* Introduced only in MMX2 set */ -#define PAVGB "pavgb" - -#include "dsputil_mmx_avg.h" - -#undef DEF -#undef PAVGB - -/***********************************/ -/* standard MMX */ - -static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) -{ - DCTELEM *p; - const UINT8 *pix; - int i; - - /* read the pixels */ - p = block; - pix = pixels; - __asm __volatile("pxor %%mm7, %%mm7":::"memory"); - for(i=0;i<4;i++) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm2, 8%0\n\t" - "movq %%mm1, 16%0\n\t" - "movq %%mm3, 24%0\n\t" - :"=m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size*2; - p += 16; - } -} - -static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) -{ - const DCTELEM *p; - UINT8 *pix; - int i; - static int p_inc = 32; /* hack to avoid gcc-2.95.2 loop unrolling bug */ - - /* read the pixels */ - p = block; - pix = pixels; - for(i=0;i<2;i++) { - __asm __volatile( - "movq %4, %%mm0\n\t" - "movq 8%4, %%mm1\n\t" - "movq 16%4, %%mm2\n\t" - "movq 24%4, %%mm3\n\t" - "movq 32%4, %%mm4\n\t" - "movq 40%4, %%mm5\n\t" - "movq 48%4, %%mm6\n\t" - "movq 56%4, %%mm7\n\t" - "packuswb %%mm1, %%mm0\n\t" - "packuswb %%mm3, %%mm2\n\t" - "packuswb %%mm5, %%mm4\n\t" - "packuswb %%mm7, %%mm6\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm2, %1\n\t" - "movq %%mm4, %2\n\t" - "movq %%mm6, %3\n\t" - :"=m"(*pix), "=m"(*(pix+line_size)) - ,"=m"(*(pix+line_size*2)), "=m"(*(pix+line_size*3)) - :"m"(*p) - :"memory"); - pix += line_size*4; - p += p_inc; - } -} - -static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) -{ - const DCTELEM *p; - UINT8 *pix; - int i; - static int p_inc = 16; /* hack to avoid gcc-2.95.2 loop unrolling bug */ - - /* read the pixels */ - p = block; - pix = pixels; - __asm __volatile("pxor %%mm7, %%mm7":::"memory"); - for(i=0;i<4;i++) { - __asm __volatile( - "movq %2, %%mm0\n\t" - "movq 8%2, %%mm1\n\t" - "movq 16%2, %%mm2\n\t" - "movq 24%2, %%mm3\n\t" - "movq %0, %%mm4\n\t" - "movq %1, %%mm6\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddsw %%mm4, %%mm0\n\t" - "paddsw %%mm5, %%mm1\n\t" - "movq %%mm6, %%mm5\n\t" - "punpcklbw %%mm7, %%mm6\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddsw %%mm6, %%mm2\n\t" - "paddsw %%mm5, %%mm3\n\t" - "packuswb %%mm1, %%mm0\n\t" - "packuswb %%mm3, %%mm2\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm2, %1\n\t" - :"=m"(*pix), "=m"(*(pix+line_size)) - :"m"(*p) - :"memory"); - pix += line_size*2; - p += p_inc; - } -} - -static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>2; - dh=h&3; - while(hh--) { - __asm __volatile( - "movq %4, %%mm0\n\t" - "movq %5, %%mm1\n\t" - "movq %6, %%mm2\n\t" - "movq %7, %%mm3\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, %1\n\t" - "movq %%mm2, %2\n\t" - "movq %%mm3, %3\n\t" - :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3)) - :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3)) - :"memory"); - pix = pix + line_size*4; - p = p + line_size*4; - } - while(dh--) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix = pix + line_size; - p = p + line_size; - } -} - -static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm4\n\t" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq 1%1, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm4, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; p += line_size; - } while (--h); -} - -static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm4\n\t" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm4, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -} - -static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo):"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "psrlw $2, %%mm0\n\t" - "psrlw $2, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while(--h); -} - -static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq 1%1, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -} - -static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while(--h); -} - -static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "psrlw $2, %%mm0\n\t" - "psrlw $2, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while(--h); -} - -static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "paddusw %%mm6, %%mm0\n\t" - "paddusw %%mm6, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } - while (--h); -} - -static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %1, %%mm1\n\t" - "movq %0, %%mm0\n\t" - "movq 1%1, %%mm4\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "paddusw %%mm4, %%mm1\n\t" - "paddusw %%mm5, %%mm3\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm6, %%mm3\n\t" - "psrlw $1, %%mm1\n\t" - "psrlw $1, %%mm3\n\t" - "paddusw %%mm6, %%mm0\n\t" - "paddusw %%mm6, %%mm2\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -} - -static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %1, %%mm1\n\t" - "movq %0, %%mm0\n\t" - "movq %2, %%mm4\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "paddusw %%mm4, %%mm1\n\t" - "paddusw %%mm5, %%mm3\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm6, %%mm3\n\t" - "psrlw $1, %%mm1\n\t" - "psrlw $1, %%mm3\n\t" - "paddusw %%mm6, %%mm0\n\t" - "paddusw %%mm6, %%mm2\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size ; - } while(--h); -} - -static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo):"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "movq %3, %%mm5\n\t" - "psrlw $2, %%mm0\n\t" - "movq %0, %%mm1\n\t" - "psrlw $2, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "paddusw %%mm5, %%mm0\n\t" - "paddusw %%mm5, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)), "m"(mm_wone) - :"memory"); - pix += line_size; - p += line_size ; - } while(--h); -} - -static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size ; - } while (--h); -} - -static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t":::"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq 1%1, %%mm1\n\t" - "movq %0, %%mm4\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm5, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -} - -static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t":::"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq %0, %%mm4\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm5, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size ; - } while(--h); -} - -static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "movq %0, %%mm1\n\t" - "psrlw $2, %%mm0\n\t" - "movq %%mm1, %%mm3\n\t" - "psrlw $2, %%mm2\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while(--h); -} - -static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile("pxor %%mm7, %%mm7":::"memory"); - do { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += 8; - } while (--h); -} - -static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq %%mm2, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddusw %%mm4, %%mm2\n\t" - "paddusw %%mm5, %%mm3\n\t" - "paddusw %%mm6, %%mm2\n\t" - "paddusw %%mm6, %%mm3\n\t" - "psrlw $1, %%mm2\n\t" - "psrlw $1, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += 8; - } while (--h); -} - -static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6" - ::"m"(mm_wone):"memory"); - do { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq %2, %%mm4\n\t" - "movq %%mm2, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddusw %%mm4, %%mm2\n\t" - "paddusw %%mm5, %%mm3\n\t" - "paddusw %%mm6, %%mm2\n\t" - "paddusw %%mm6, %%mm3\n\t" - "psrlw $1, %%mm2\n\t" - "psrlw $1, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"=m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += 8; - } while (--h); -} - -static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo):"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "movq %0, %%mm1\n\t" - "movq 8%0, %%mm3\n\t" - "psrlw $2, %%mm0\n\t" - "psrlw $2, %%mm2\n\t" - "psubsw %%mm0, %%mm1\n\t" - "psubsw %%mm2, %%mm3\n\t" - "movq %%mm1, %0\n\t" - "movq %%mm3, 8%0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += 8 ; - } while(--h); -} - -void dsputil_init_mmx(void) -{ - mm_flags = xine_mm_accel(); -#if 0 - printf("CPU flags:"); - if (mm_flags & MM_MMX) - printf(" mmx"); - if (mm_flags & MM_MMXEXT) - printf(" mmxext"); - if (mm_flags & MM_3DNOW) - printf(" 3dnow"); - if (mm_flags & MM_SSE) - printf(" sse"); - if (mm_flags & MM_SSE2) - printf(" sse2"); - printf("\n"); -#endif - - if (mm_flags & MM_MMX) { - get_pixels = get_pixels_mmx; - put_pixels_clamped = put_pixels_clamped_mmx; - add_pixels_clamped = add_pixels_clamped_mmx; - - pix_abs16x16 = pix_abs16x16_mmx; - pix_abs16x16_x2 = pix_abs16x16_x2_mmx; - pix_abs16x16_y2 = pix_abs16x16_y2_mmx; - pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; - /* av_fdct = fdct_mmx; */ - - put_pixels_tab[0] = put_pixels_mmx; - put_pixels_tab[1] = put_pixels_x2_mmx; - put_pixels_tab[2] = put_pixels_y2_mmx; - put_pixels_tab[3] = put_pixels_xy2_mmx; - - put_no_rnd_pixels_tab[0] = put_pixels_mmx; - put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; - put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; - put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; - - avg_pixels_tab[0] = avg_pixels_mmx; - avg_pixels_tab[1] = avg_pixels_x2_mmx; - avg_pixels_tab[2] = avg_pixels_y2_mmx; - avg_pixels_tab[3] = avg_pixels_xy2_mmx; - - avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; - avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; - avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; - avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; - - sub_pixels_tab[0] = sub_pixels_mmx; - sub_pixels_tab[1] = sub_pixels_x2_mmx; - sub_pixels_tab[2] = sub_pixels_y2_mmx; - sub_pixels_tab[3] = sub_pixels_xy2_mmx; - - if (mm_flags & MM_MMXEXT) { - pix_abs16x16 = pix_abs16x16_sse; - } - - if (mm_flags & MM_SSE) { - put_pixels_tab[1] = put_pixels_x2_sse; - put_pixels_tab[2] = put_pixels_y2_sse; - - avg_pixels_tab[0] = avg_pixels_sse; - avg_pixels_tab[1] = avg_pixels_x2_sse; - avg_pixels_tab[2] = avg_pixels_y2_sse; - avg_pixels_tab[3] = avg_pixels_xy2_sse; - - sub_pixels_tab[1] = sub_pixels_x2_sse; - sub_pixels_tab[2] = sub_pixels_y2_sse; - } else if (mm_flags & MM_3DNOW) { - put_pixels_tab[1] = put_pixels_x2_3dnow; - put_pixels_tab[2] = put_pixels_y2_3dnow; - - avg_pixels_tab[0] = avg_pixels_3dnow; - avg_pixels_tab[1] = avg_pixels_x2_3dnow; - avg_pixels_tab[2] = avg_pixels_y2_3dnow; - avg_pixels_tab[3] = avg_pixels_xy2_3dnow; - - sub_pixels_tab[1] = sub_pixels_x2_3dnow; - sub_pixels_tab[2] = sub_pixels_y2_3dnow; - } - - /* idct */ - if (mm_flags & MM_MMXEXT) { - ff_idct = ff_mmxext_idct; - } else { - ff_idct = ff_mmx_idct; - } - } -} diff --git a/src/libffmpeg/libavcodec/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/dsputil_mmx_avg.h deleted file mode 100644 index 5cd640f71..000000000 --- a/src/libffmpeg/libavcodec/dsputil_mmx_avg.h +++ /dev/null @@ -1,344 +0,0 @@ -/* - * DSP utils : average functions are compiled twice for 3dnow/mmx2 - * Copyright (c) 2000, 2001 Gerard Lantau. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - */ - -static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>2; - dh=h&3; - while(hh--) { - __asm __volatile( - "movq %4, %%mm0\n\t" - "movq 1%4, %%mm1\n\t" - "movq %5, %%mm2\n\t" - "movq 1%5, %%mm3\n\t" - "movq %6, %%mm4\n\t" - "movq 1%6, %%mm5\n\t" - "movq %7, %%mm6\n\t" - "movq 1%7, %%mm7\n\t" - PAVGB" %%mm1, %%mm0\n\t" - PAVGB" %%mm3, %%mm2\n\t" - PAVGB" %%mm5, %%mm4\n\t" - PAVGB" %%mm7, %%mm6\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm2, %1\n\t" - "movq %%mm4, %2\n\t" - "movq %%mm6, %3\n\t" - :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3)) - :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3)) - :"memory"); - pix += line_size*4; p += line_size*4; - } - while(dh--) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq 1%1, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; p += line_size; - } -} - -static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - - hh=h>>1; - dh=h&1; - while(hh--) { - __asm __volatile( - "movq %2, %%mm0\n\t" - "movq %3, %%mm1\n\t" - "movq %4, %%mm2\n\t" - PAVGB" %%mm1, %%mm0\n\t" - PAVGB" %%mm2, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, %1\n\t" - :"=m"(*p), "=m"(*(p+line_size)) - :"m"(*pix), "m"(*(pix+line_size)), - "m"(*(pix+line_size*2)) - :"memory"); - pix += line_size*2; - p += line_size*2; - } - if(dh) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - } -} - -static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>2; - dh=h&3; - while(hh--) { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %4, %%mm1\n\t" - "movq %1, %%mm2\n\t" - "movq %5, %%mm3\n\t" - "movq %2, %%mm4\n\t" - "movq %6, %%mm5\n\t" - "movq %3, %%mm6\n\t" - "movq %7, %%mm7\n\t" - PAVGB" %%mm1, %%mm0\n\t" - PAVGB" %%mm3, %%mm2\n\t" - PAVGB" %%mm5, %%mm4\n\t" - PAVGB" %%mm7, %%mm6\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm2, %1\n\t" - "movq %%mm4, %2\n\t" - "movq %%mm6, %3\n\t" - :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3)) - :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3)) - :"memory"); - pix += line_size*4; p += line_size*4; - } - while(dh--) { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; p += line_size; - } -} - -static void DEF(avg_pixels_x2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>1; - dh=h&1; - while(hh--) { - __asm __volatile( - "movq %2, %%mm2\n\t" - "movq 1%2, %%mm3\n\t" - "movq %3, %%mm4\n\t" - "movq 1%3, %%mm5\n\t" - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - PAVGB" %%mm3, %%mm2\n\t" - PAVGB" %%mm2, %%mm0\n\t" - PAVGB" %%mm5, %%mm4\n\t" - PAVGB" %%mm4, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, %1\n\t" - :"=m"(*p), "=m"(*(p+line_size)) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size*2; - p += line_size*2; - } - if(dh) { - __asm __volatile( - "movq %1, %%mm1\n\t" - "movq 1%1, %%mm2\n\t" - "movq %0, %%mm0\n\t" - PAVGB" %%mm2, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - } -} - -static void DEF(avg_pixels_y2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>1; - dh=h&1; - while(hh--) { - __asm __volatile( - "movq %2, %%mm2\n\t" - "movq %3, %%mm3\n\t" - "movq %3, %%mm4\n\t" - "movq %4, %%mm5\n\t" - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - PAVGB" %%mm3, %%mm2\n\t" - PAVGB" %%mm2, %%mm0\n\t" - PAVGB" %%mm5, %%mm4\n\t" - PAVGB" %%mm4, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, %1\n\t" - :"=m"(*p), "=m"(*(p+line_size)) - :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)) - :"memory"); - pix += line_size*2; - p += line_size*2; - } - if(dh) { - __asm __volatile( - "movq %1, %%mm1\n\t" - "movq %2, %%mm2\n\t" - "movq %0, %%mm0\n\t" - PAVGB" %%mm2, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - } -} - -static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo):"memory"); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "psrlw $2, %%mm0\n\t" - "psrlw $2, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - PAVGB" %0, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size ; - } while(--h); -} - -static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7":::"memory"); - do { - __asm __volatile( - "movq 1%1, %%mm2\n\t" - "movq %0, %%mm0\n\t" - PAVGB" %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += 8; - } while (--h); -} - -static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7":::"memory"); - do { - __asm __volatile( - "movq %2, %%mm2\n\t" - "movq %0, %%mm0\n\t" - PAVGB" %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"=m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += 8; - } while (--h); -} - diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c index 29e6ab191..da694411f 100644 --- a/src/libffmpeg/libavcodec/h263.c +++ b/src/libffmpeg/libavcodec/h263.c @@ -1,7 +1,7 @@ /* * H263/MPEG4 backend for ffmpeg encoder and decoder * Copyright (c) 2000,2001 Gerard Lantau. - * H263+ support for custom picture format. + * H263+ support. * Copyright (c) 2001 Juan J. Sierralta P. * * This program is free software; you can redistribute it and/or modify @@ -28,9 +28,11 @@ static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n); static void h263_encode_motion(MpegEncContext * s, int val); +static void h263p_encode_umotion(MpegEncContext * s, int val); static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n); static int h263_decode_motion(MpegEncContext * s, int pred); +static int h263p_decode_umotion(MpegEncContext * s, int pred); static int h263_decode_block(MpegEncContext * s, DCTELEM * block, int n, int coded); static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, @@ -57,7 +59,7 @@ int h263_get_picture_format(int width, int height) void h263_encode_picture_header(MpegEncContext * s, int picture_number) { - int format, umvplus; + int format; align_put_bits(&s->pb); put_bits(&s->pb, 22, 0x20); @@ -69,10 +71,10 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number) put_bits(&s->pb, 1, 0); /* split screen off */ put_bits(&s->pb, 1, 0); /* camera off */ put_bits(&s->pb, 1, 0); /* freeze picture release off */ - + + format = h263_get_picture_format(s->width, s->height); if (!s->h263_plus) { /* H.263v1 */ - format = h263_get_picture_format(s->width, s->height); put_bits(&s->pb, 3, format); put_bits(&s->pb, 1, (s->pict_type == P_TYPE)); /* By now UMV IS DISABLED ON H.263v1, since the restrictions @@ -89,10 +91,14 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number) /* H.263 Plus PTYPE */ put_bits(&s->pb, 3, 7); put_bits(&s->pb,3,1); /* Update Full Extended PTYPE */ - put_bits(&s->pb,3,6); /* Custom Source Format */ + if (format == 7) + put_bits(&s->pb,3,6); /* Custom Source Format */ + else + put_bits(&s->pb, 3, format); + put_bits(&s->pb,1,0); /* Custom PCF: off */ - umvplus = (s->pict_type == P_TYPE) && s->unrestricted_mv; - put_bits(&s->pb, 1, umvplus); /* Unrestricted Motion Vector */ + s->umvplus = (s->pict_type == P_TYPE) && s->unrestricted_mv; + put_bits(&s->pb, 1, s->umvplus); /* Unrestricted Motion Vector */ put_bits(&s->pb,1,0); /* SAC: off */ put_bits(&s->pb,1,0); /* Advanced Prediction Mode: off */ put_bits(&s->pb,1,0); /* Advanced Intra Coding: off */ @@ -116,14 +122,17 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number) /* This should be here if PLUSPTYPE */ put_bits(&s->pb, 1, 0); /* Continuous Presence Multipoint mode: off */ - /* Custom Picture Format (CPFMT) */ + if (format == 7) { + /* Custom Picture Format (CPFMT) */ - put_bits(&s->pb,4,2); /* Aspect ratio: CIF 12:11 (4:3) picture */ - put_bits(&s->pb,9,(s->width >> 2) - 1); - put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */ - put_bits(&s->pb,9,(s->height >> 2)); + put_bits(&s->pb,4,2); /* Aspect ratio: CIF 12:11 (4:3) picture */ + put_bits(&s->pb,9,(s->width >> 2) - 1); + put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */ + put_bits(&s->pb,9,(s->height >> 2)); + } + /* Unlimited Unrestricted Motion Vectors Indicator (UUI) */ - if (umvplus) + if (s->umvplus) put_bits(&s->pb,1,1); /* Limited according tables of Annex D */ put_bits(&s->pb, 5, s->qscale); } @@ -131,40 +140,82 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number) put_bits(&s->pb, 1, 0); /* no PEI */ } +int h263_encode_gob_header(MpegEncContext * s, int mb_line) +{ + int pdif=0; + + /* Check to see if we need to put a new GBSC */ + /* for RTP packetization */ + if (s->rtp_mode) { + pdif = s->pb.buf_ptr - s->ptr_lastgob; + if (pdif >= s->rtp_payload_size) { + /* Bad luck, packet must be cut before */ + align_put_bits(&s->pb); + s->ptr_lastgob = s->pb.buf_ptr; + put_bits(&s->pb, 17, 1); /* GBSC */ + s->gob_number = mb_line; + put_bits(&s->pb, 5, s->gob_number); /* GN */ + put_bits(&s->pb, 2, 1); /* GFID */ + put_bits(&s->pb, 5, s->qscale); /* GQUANT */ + return pdif; + } else if (pdif + s->mb_line_avgsize >= s->rtp_payload_size) { + /* Cut the packet before we can't */ + align_put_bits(&s->pb); + s->ptr_lastgob = s->pb.buf_ptr; + put_bits(&s->pb, 17, 1); /* GBSC */ + s->gob_number = mb_line; + put_bits(&s->pb, 5, s->gob_number); /* GN */ + put_bits(&s->pb, 2, 1); /* GFID */ + put_bits(&s->pb, 5, s->qscale); /* GQUANT */ + return pdif; + } + } + return 0; +} + void h263_encode_mb(MpegEncContext * s, DCTELEM block[6][64], int motion_x, int motion_y) { int cbpc, cbpy, i, cbp, pred_x, pred_y; - + // printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y); - if (!s->mb_intra) { - /* compute cbp */ - cbp = 0; - for (i = 0; i < 6; i++) { - if (s->block_last_index[i] >= 0) - cbp |= 1 << (5 - i); - } - if ((cbp | motion_x | motion_y) == 0) { - /* skip macroblock */ - put_bits(&s->pb, 1, 1); - return; - } - put_bits(&s->pb, 1, 0); /* mb coded */ - cbpc = cbp & 3; - put_bits(&s->pb, - inter_MCBPC_bits[cbpc], - inter_MCBPC_code[cbpc]); - cbpy = cbp >> 2; - cbpy ^= 0xf; - put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); - - /* motion vectors: 16x16 mode only now */ - h263_pred_motion(s, 0, &pred_x, &pred_y); - - h263_encode_motion(s, motion_x - pred_x); - h263_encode_motion(s, motion_y - pred_y); - } else { + if (!s->mb_intra) { + /* compute cbp */ + cbp = 0; + for (i = 0; i < 6; i++) { + if (s->block_last_index[i] >= 0) + cbp |= 1 << (5 - i); + } + if ((cbp | motion_x | motion_y) == 0) { + /* skip macroblock */ + put_bits(&s->pb, 1, 1); + return; + } + put_bits(&s->pb, 1, 0); /* mb coded */ + cbpc = cbp & 3; + put_bits(&s->pb, + inter_MCBPC_bits[cbpc], + inter_MCBPC_code[cbpc]); + cbpy = cbp >> 2; + cbpy ^= 0xf; + put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); + + /* motion vectors: 16x16 mode only now */ + h263_pred_motion(s, 0, &pred_x, &pred_y); + + if (!s->umvplus) { + h263_encode_motion(s, motion_x - pred_x); + h263_encode_motion(s, motion_y - pred_y); + } + else { + h263p_encode_umotion(s, motion_x - pred_x); + h263p_encode_umotion(s, motion_y - pred_y); + if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1)) + /* To prevent Start Code emulation */ + put_bits(&s->pb,1,1); + } + } else { /* compute cbp */ cbp = 0; for (i = 0; i < 6; i++) { @@ -234,7 +285,7 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block, mot_val = s->motion_val[(x) + (y) * wrap]; /* special case for first line */ - if (y == 1 || s->first_slice_line) { + if (y == 1 || s->first_slice_line || s->first_gob_line) { A = s->motion_val[(x-1) + (y) * wrap]; *px = A[0]; *py = A[1]; @@ -305,6 +356,45 @@ static void h263_encode_motion(MpegEncContext * s, int val) } } +/* Encode MV differences on H.263+ with Unrestricted MV mode */ +static void h263p_encode_umotion(MpegEncContext * s, int val) +{ + short sval = 0; + short i = 0; + short n_bits = 0; + short temp_val; + int code = 0; + int tcode; + + if ( val == 0) + put_bits(&s->pb, 1, 1); + else if (val == 1) + put_bits(&s->pb, 3, 0); + else if (val == -1) + put_bits(&s->pb, 3, 2); + else { + + sval = ((val < 0) ? (short)(-val):(short)val); + temp_val = sval; + + while (temp_val != 0) { + temp_val = temp_val >> 1; + n_bits++; + } + + i = n_bits - 1; + while (i > 0) { + tcode = (sval & (1 << (i-1))) >> (i-1); + tcode = (tcode << 1) | 1; + code = (code << 2) | tcode; + i--; + } + code = ((code << 1) | (val < 0)) << 1; + put_bits(&s->pb, (2*n_bits)+1, code); + //printf("\nVal = %d\tCode = %d", sval, code); + } +} + void h263_encode_init_vlc(MpegEncContext *s) { static int done = 0; @@ -464,7 +554,7 @@ static int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *di return pred; } -void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, +void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, int dir) { int x, y, wrap, i; @@ -489,22 +579,22 @@ void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, /* left prediction */ ac_val -= 16; for(i=1;i<8;i++) { - block[i*8] += ac_val[i]; + block[block_permute_op(i*8)] += ac_val[i]; } } else { /* top prediction */ ac_val -= 16 * wrap; for(i=1;i<8;i++) { - block[i] += ac_val[i + 8]; + block[block_permute_op(i)] += ac_val[i + 8]; } } } /* left copy */ for(i=1;i<8;i++) - ac_val1[i] = block[i * 8]; + ac_val1[i] = block[block_permute_op(i * 8)]; /* top copy */ for(i=1;i<8;i++) - ac_val1[8 + i] = block[i]; + ac_val1[8 + i] = block[block_permute_op(i)]; } static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr) @@ -693,7 +783,7 @@ void h263_decode_init_vlc(MpegEncContext *s) init_vlc(&intra_MCBPC_vlc, 6, 8, intra_MCBPC_bits, 1, 1, intra_MCBPC_code, 1, 1); - init_vlc(&inter_MCBPC_vlc, 9, 20, + init_vlc(&inter_MCBPC_vlc, 9, 25, inter_MCBPC_bits, 1, 1, inter_MCBPC_code, 1, 1); init_vlc(&cbpy_vlc, 6, 16, @@ -715,13 +805,38 @@ void h263_decode_init_vlc(MpegEncContext *s) } } +int h263_decode_gob_header(MpegEncContext *s) +{ + unsigned int val, gfid; + + /* Check for GOB Start Code */ + val = show_bits(&s->gb, 16); + if (val == 0) { + /* We have a GBSC probably with GSTUFF */ + skip_bits(&s->gb, 16); /* Drop the zeros */ + while (get_bits1(&s->gb) == 0); /* Seek the '1' bit */ +#ifdef DEBUG + fprintf(stderr,"\nGOB Start Code at MB %d\n", (s->mb_y * s->mb_width) + s->mb_x); +#endif + s->gob_number = get_bits(&s->gb, 5); /* GN */ + gfid = get_bits(&s->gb, 2); /* GFID */ + s->qscale = get_bits(&s->gb, 5); /* GQUANT */ +#ifdef DEBUG + fprintf(stderr, "\nGN: %u GFID: %u Quant: %u\n", gn, gfid, s->qscale); +#endif + return 1; + } + return 0; + +} + int h263_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) { int cbpc, cbpy, i, cbp, pred_x, pred_y, mx, my, dquant; INT16 *mot_val; static INT8 quant_tab[4] = { -1, -2, 1, 2 }; - + if (s->pict_type == P_TYPE) { if (get_bits1(&s->gb)) { /* skip mb */ @@ -736,8 +851,14 @@ int h263_decode_mb(MpegEncContext *s, return 0; } cbpc = get_vlc(&s->gb, &inter_MCBPC_vlc); + //fprintf(stderr, "\tCBPC: %d", cbpc); if (cbpc < 0) return -1; + if (cbpc > 20) + cbpc+=3; + else if (cbpc == 20) + fprintf(stderr, "Stuffing !"); + dquant = cbpc & 8; s->mb_intra = ((cbpc & 4) != 0); } else { @@ -763,33 +884,55 @@ int h263_decode_mb(MpegEncContext *s, /* 16x16 motion prediction */ s->mv_type = MV_TYPE_16X16; h263_pred_motion(s, 0, &pred_x, &pred_y); - mx = h263_decode_motion(s, pred_x); + if (s->umvplus_dec) + mx = h263p_decode_umotion(s, pred_x); + else + mx = h263_decode_motion(s, pred_x); if (mx >= 0xffff) return -1; - my = h263_decode_motion(s, pred_y); + + if (s->umvplus_dec) + my = h263p_decode_umotion(s, pred_y); + else + my = h263_decode_motion(s, pred_y); if (my >= 0xffff) return -1; s->mv[0][0][0] = mx; s->mv[0][0][1] = my; + /*fprintf(stderr, "\n MB %d", (s->mb_y * s->mb_width) + s->mb_x); + fprintf(stderr, "\n\tmvx: %d\t\tpredx: %d", mx, pred_x); + fprintf(stderr, "\n\tmvy: %d\t\tpredy: %d", my, pred_y);*/ + if (s->umvplus_dec && (mx - pred_x) == 1 && (my - pred_y) == 1) + skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */ + } else { s->mv_type = MV_TYPE_8X8; for(i=0;i<4;i++) { mot_val = h263_pred_motion(s, i, &pred_x, &pred_y); - mx = h263_decode_motion(s, pred_x); + if (s->umvplus_dec) + mx = h263p_decode_umotion(s, pred_x); + else + mx = h263_decode_motion(s, pred_x); if (mx >= 0xffff) return -1; - my = h263_decode_motion(s, pred_y); + + if (s->umvplus_dec) + my = h263p_decode_umotion(s, pred_y); + else + my = h263_decode_motion(s, pred_y); if (my >= 0xffff) return -1; s->mv[0][i][0] = mx; s->mv[0][i][1] = my; + if (s->umvplus_dec && (mx - pred_x) == 1 && (my - pred_y) == 1) + skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */ mot_val[0] = mx; mot_val[1] = my; } } } else { s->ac_pred = 0; - if (s->h263_pred) { + if (s->h263_pred) { s->ac_pred = get_bits1(&s->gb); } cbpy = get_vlc(&s->gb, &cbpy_vlc); @@ -853,10 +996,37 @@ static int h263_decode_motion(MpegEncContext * s, int pred) val += 64; if (pred > 32 && val > 63) val -= 64; + } return val; } +/* Decodes RVLC of H.263+ UMV */ +static int h263p_decode_umotion(MpegEncContext * s, int pred) +{ + int code = 0, sign; + + if (get_bits1(&s->gb)) /* Motion difference = 0 */ + return pred; + + code = 2 + get_bits1(&s->gb); + + while (get_bits1(&s->gb)) + { + code <<= 1; + code += get_bits1(&s->gb); + } + sign = code & 1; + code >>= 1; + + code = (sign) ? (pred - code) : (pred + code); +#ifdef DEBUG + fprintf(stderr,"H.263+ UMV Motion = %d\n", code); +#endif + return code; + +} + static int h263_decode_block(MpegEncContext * s, DCTELEM * block, int n, int coded) { @@ -1081,16 +1251,21 @@ int h263_decode_picture_header(MpegEncContext *s) skip_bits1(&s->gb); /* camera off */ skip_bits1(&s->gb); /* freeze picture release off */ + /* Reset GOB number */ + s->gob_number = 0; + format = get_bits(&s->gb, 3); - if (format != 7) { + if (format != 7 && format != 6) { s->h263_plus = 0; /* H.263v1 */ width = h263_format[format][0]; height = h263_format[format][1]; if (!width) return -1; - + + s->width = width; + s->height = height; s->pict_type = I_TYPE + get_bits1(&s->gb); s->unrestricted_mv = get_bits1(&s->gb); @@ -1098,33 +1273,68 @@ int h263_decode_picture_header(MpegEncContext *s) if (get_bits1(&s->gb) != 0) return -1; /* SAC: off */ - if (get_bits1(&s->gb) != 0) - return -1; /* advanced prediction mode: off */ + if (get_bits1(&s->gb) != 0) { + s->mv_type = MV_TYPE_8X8; /* Advanced prediction mode */ + } + if (get_bits1(&s->gb) != 0) return -1; /* not PB frame */ s->qscale = get_bits(&s->gb, 5); skip_bits1(&s->gb); /* Continuous Presence Multipoint mode: off */ } else { - s->h263_plus = 1; + int ufep; + /* H.263v2 */ - if (get_bits(&s->gb, 3) != 1) - return -1; - if (get_bits(&s->gb, 3) != 6) /* custom source format */ + s->h263_plus = 1; + ufep = get_bits(&s->gb, 3); /* Update Full Extended PTYPE */ + + if (ufep == 1) { + /* OPPTYPE */ + format = get_bits(&s->gb, 3); + skip_bits(&s->gb,1); /* Custom PCF */ + s->umvplus_dec = get_bits(&s->gb, 1); /* Unrestricted Motion Vector */ + skip_bits1(&s->gb); /* Syntax-based Arithmetic Coding (SAC) */ + if (get_bits1(&s->gb) != 0) { + s->mv_type = MV_TYPE_8X8; /* Advanced prediction mode */ + } + skip_bits(&s->gb, 8); + skip_bits(&s->gb, 3); /* Reserved */ + } else if (ufep != 0) return -1; - skip_bits(&s->gb, 12); - skip_bits(&s->gb, 3); + + /* MPPTYPE */ s->pict_type = get_bits(&s->gb, 3) + 1; if (s->pict_type != I_TYPE && s->pict_type != P_TYPE) return -1; skip_bits(&s->gb, 7); - skip_bits(&s->gb, 4); /* aspect ratio */ - width = (get_bits(&s->gb, 9) + 1) * 4; - skip_bits1(&s->gb); - height = get_bits(&s->gb, 9) * 4; - if (height == 0) - return -1; + + /* Get the picture dimensions */ + if (ufep) { + if (format == 6) { + /* Custom Picture Format (CPFMT) */ + skip_bits(&s->gb, 4); /* aspect ratio */ + width = (get_bits(&s->gb, 9) + 1) * 4; + skip_bits1(&s->gb); + height = get_bits(&s->gb, 9) * 4; +#ifdef DEBUG + fprintf(stderr,"\nH.263+ Custom picture: %dx%d\n",width,height); +#endif + } + else { + width = h263_format[format][0]; + height = h263_format[format][1]; + } + if ((width == 0) || (height == 0)) + return -1; + s->width = width; + s->height = height; + if (s->umvplus_dec) { + skip_bits1(&s->gb); /* Unlimited Unrestricted Motion Vectors Indicator (UUI) */ + } + } + s->qscale = get_bits(&s->gb, 5); } /* PEI */ @@ -1132,8 +1342,6 @@ int h263_decode_picture_header(MpegEncContext *s) skip_bits(&s->gb, 8); } s->f_code = 1; - s->width = width; - s->height = height; return 0; } @@ -1335,3 +1543,4 @@ int intel_h263_decode_picture_header(MpegEncContext *s) s->f_code = 1; return 0; } + diff --git a/src/libffmpeg/libavcodec/h263data.h b/src/libffmpeg/libavcodec/h263data.h index 4fd9d3629..d41996a3b 100644 --- a/src/libffmpeg/libavcodec/h263data.h +++ b/src/libffmpeg/libavcodec/h263data.h @@ -4,6 +4,27 @@ static const UINT8 intra_MCBPC_code[8] = { 1, 1, 2, 3, 1, 1, 2, 3 }; static const UINT8 intra_MCBPC_bits[8] = { 1, 3, 3, 3, 4, 6, 6, 6 }; /* inter MCBPC, mb_type = (inter), (intra), (interq), (intraq), (inter4v) */ +/* Changed the tables for interq and inter4v+q, following the standard ** Juanjo ** */ +static const UINT8 inter_MCBPC_code[25] = { + 1, 3, 2, 5, + 3, 4, 3, 3, + 3, 7, 6, 5, + 4, 4, 3, 2, + 2, 5, 4, 5, + 1, /* Stuffing */ + 2, 12, 14, 15, +}; +static const UINT8 inter_MCBPC_bits[25] = { + 1, 4, 4, 6, + 5, 8, 8, 7, + 3, 7, 7, 9, + 6, 9, 9, 9, + 3, 7, 7, 8, + 9, /* Stuffing */ + 11, 13, 13, 13, +}; + +/* This is the old table static const UINT8 inter_MCBPC_code[20] = { 1, 3, 2, 5, 3, 4, 3, 3, @@ -17,7 +38,7 @@ static const UINT8 inter_MCBPC_bits[20] = { 12, 12, 12, 12, 6, 9, 9, 9, 3, 7, 7, 8, -}; +};*/ static const UINT8 cbpy_tab[16][2] = { diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c index ec14c09e2..be76e3030 100644 --- a/src/libffmpeg/libavcodec/h263dec.c +++ b/src/libffmpeg/libavcodec/h263dec.c @@ -23,7 +23,7 @@ #include "avcodec.h" #include "mpegvideo.h" -#undef DEBUG +//#define DEBUG static int h263_decode_init(AVCodecContext *avctx) { @@ -39,6 +39,8 @@ static int h263_decode_init(AVCodecContext *avctx) /* select sub codec */ switch(avctx->codec->id) { case CODEC_ID_H263: + s->gob_number = 0; + s->first_gob_line = 0; break; case CODEC_ID_MPEG4: s->time_increment_bits = 4; /* default value for broken headers */ @@ -56,8 +58,9 @@ static int h263_decode_init(AVCodecContext *avctx) } /* for h263, we allocate the images after having read the header */ - if (MPV_common_init(s) < 0) - return -1; + if (avctx->codec->id != CODEC_ID_H263) + if (MPV_common_init(s) < 0) + return -1; /* XXX: suppress this matrix init, only needed because using mpeg1 dequantize in mmx case */ @@ -92,7 +95,7 @@ static int h263_decode_frame(AVCodecContext *avctx, printf("*****frame %d size=%d\n", avctx->frame_number, buf_size); printf("bytes=%x %x %x %x\n", buf[0], buf[1], buf[2], buf[3]); #endif - + /* no supplementary picture */ if (buf_size == 0) { *data_size = 0; @@ -110,18 +113,25 @@ static int h263_decode_frame(AVCodecContext *avctx, ret = intel_h263_decode_picture_header(s); } else { ret = h263_decode_picture_header(s); + /* After H263 header decode we have the height, width, */ + /* and other parameters. So then we could init the picture */ + /* FIXME: By the way H263 decoder is evolving it should have */ + /* an H263EncContext */ + if (!s->context_initialized) { + avctx->width = s->width; + avctx->height = s->height; + if (MPV_common_init(s) < 0) + return -1; + } else if (s->width != avctx->width || s->height != avctx->height) { + /* H.263 could change picture size any time */ + MPV_common_end(s); + if (MPV_common_init(s) < 0) + return -1; + } } if (ret < 0) return -1; - /* make sure we start with an I-frame */ - if (s->waiting_for_keyframe) { - if (s->pict_type != I_TYPE) - return -1; - else - s->waiting_for_keyframe = 0; - } - MPV_frame_start(s); #ifdef DEBUG @@ -130,10 +140,17 @@ static int h263_decode_frame(AVCodecContext *avctx, /* decode each macroblock */ for(s->mb_y=0; s->mb_y < s->mb_height; s->mb_y++) { + /* Check for GOB headers on H.263 */ + /* FIXME: In the future H.263+ will have intra prediction */ + /* and we are gonna need another way to detect MPEG4 */ + if (s->mb_y && !s->h263_pred) { + s->first_gob_line = h263_decode_gob_header(s); + } for(s->mb_x=0; s->mb_x < s->mb_width; s->mb_x++) { #ifdef DEBUG printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y); #endif + //fprintf(stderr,"\nFrame: %d\tMB: %d",avctx->frame_number, (s->mb_y * s->mb_width) + s->mb_x); /* DCT & quantize */ if (s->h263_msmpeg4) { msmpeg4_dc_scale(s); @@ -152,8 +169,10 @@ static int h263_decode_frame(AVCodecContext *avctx, if (msmpeg4_decode_mb(s, s->block) < 0) return -1; } else { - if (h263_decode_mb(s, s->block) < 0) + if (h263_decode_mb(s, s->block) < 0) { + fprintf(stderr,"\nError at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x); return -1; + } } MPV_decode_mb(s, s->block); } diff --git a/src/libffmpeg/libavcodec/idct_mlib.c b/src/libffmpeg/libavcodec/idct_mlib.c deleted file mode 100644 index 63421273f..000000000 --- a/src/libffmpeg/libavcodec/idct_mlib.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Sun mediaLib optimized DSP utils - * Copyright (c) 2001 Juergen Keil. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -#include "dsputil.h" - -#include <mlib_types.h> -#include <mlib_status.h> -#include <mlib_sys.h> -#include <mlib_video.h> - - -void ff_idct_mlib(DCTELEM *data) -{ - mlib_VideoIDCT8x8_S16_S16 (data, data); -} - - -void ff_fdct_mlib(DCTELEM *data) -{ - mlib_VideoDCT8x8_S16_S16 (data, data); -} diff --git a/src/libffmpeg/libavcodec/idct_mmx.c b/src/libffmpeg/libavcodec/idct_mmx.c deleted file mode 100644 index 7141347ec..000000000 --- a/src/libffmpeg/libavcodec/idct_mmx.c +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Note: For libavcodec, this code can also be used under the LGPL license - */ -/* - * idct_mmx.c - * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> - * - * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. - * - * mpeg2dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * mpeg2dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <inttypes.h> - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "xineutils.h" - -#ifdef ATTR_ALIGN -#undef ATTR_ALIGN -#endif -#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) - -#define ROW_SHIFT 11 -#define COL_SHIFT 6 - -#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) -#define rounder(bias) {round (bias), round (bias)} - -#if 0 -/* C row IDCT - its just here to document the MMXEXT and MMX versions */ -static inline void idct_row (int16_t * row, int offset, - int16_t * table, int32_t * rounder) -{ - int C1, C2, C3, C4, C5, C6, C7; - int a0, a1, a2, a3, b0, b1, b2, b3; - - row += offset; - - C1 = table[1]; - C2 = table[2]; - C3 = table[3]; - C4 = table[4]; - C5 = table[5]; - C6 = table[6]; - C7 = table[7]; - - a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; - a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; - a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; - a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; - - b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; - b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; - b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; - b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; - - row[0] = (a0 + b0) >> ROW_SHIFT; - row[1] = (a1 + b1) >> ROW_SHIFT; - row[2] = (a2 + b2) >> ROW_SHIFT; - row[3] = (a3 + b3) >> ROW_SHIFT; - row[4] = (a3 - b3) >> ROW_SHIFT; - row[5] = (a2 - b2) >> ROW_SHIFT; - row[6] = (a1 - b1) >> ROW_SHIFT; - row[7] = (a0 - b0) >> ROW_SHIFT; -} -#endif - - -/* MMXEXT row IDCT */ - -#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ - c4, c6, c4, c6, \ - c1, c3, -c1, -c5, \ - c5, c7, c3, -c7, \ - c4, -c6, c4, -c6, \ - -c4, c2, c4, -c2, \ - c5, -c1, c3, -c1, \ - c7, c3, c7, -c5 } - -static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table) -{ - movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 - - movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 - movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 - - movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 - movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 - - movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 - pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 - - pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 -} - -static inline void mmxext_row (int16_t * table, int32_t * rounder) -{ - movq_m2r (*(table+8), mm1); // mm1 = -C5 -C1 C3 C1 - pmaddwd_r2r (mm2, mm4); // mm4 = C4*x0+C6*x2 C4*x4+C6*x6 - - pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x4-C6*x6 C4*x0-C6*x2 - pshufw_r2r (mm6, mm6, 0x4e); // mm6 = x3 x1 x7 x5 - - movq_m2r (*(table+12), mm7); // mm7 = -C7 C3 C7 C5 - pmaddwd_r2r (mm5, mm1); // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 - - paddd_m2r (*rounder, mm3); // mm3 += rounder - pmaddwd_r2r (mm6, mm7); // mm7 = C3*x1-C7*x3 C5*x5+C7*x7 - - pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 - paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder - - pmaddwd_m2r (*(table+24), mm5); // mm5 = C3*x5-C1*x7 C5*x1-C1*x3 - movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder - - pmaddwd_m2r (*(table+28), mm6); // mm6 = C7*x1-C5*x3 C7*x5+C3*x7 - paddd_r2r (mm7, mm1); // mm1 = b1 b0 - - paddd_m2r (*rounder, mm0); // mm0 += rounder - psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder - - psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 - paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder - - paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder - psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 - - paddd_r2r (mm6, mm5); // mm5 = b3 b2 - movq_r2r (mm0, mm4); // mm4 = a3 a2 + rounder - - paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder - psubd_r2r (mm5, mm4); // mm4 = a3-b3 a2-b2 + rounder -} - -static inline void mmxext_row_tail (int16_t * row, int store) -{ - psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 - - psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 - - packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 - - packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 - - movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 - pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 - - /* slot */ - - movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 -} - -static inline void mmxext_row_mid (int16_t * row, int store, - int offset, int16_t * table) -{ - movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 - psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 - - movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 - psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 - - packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 - movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 - - packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 - movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 - - movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 - pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 - - movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 - movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 - - pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 - - movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 - pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 -} - - -/* MMX row IDCT */ - -#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ - c4, c6, -c4, -c2, \ - c1, c3, c3, -c7, \ - c5, c7, -c1, -c5, \ - c4, -c6, c4, -c2, \ - -c4, c2, c4, -c6, \ - c5, -c1, c7, -c5, \ - c7, c3, c3, -c1 } - -static inline void mmx_row_head (int16_t * row, int offset, int16_t * table) -{ - movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 - - movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 - movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 - - movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 - movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 - - punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 - - movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 - pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 - - movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 - punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 -} - -static inline void mmx_row (int16_t * table, int32_t * rounder) -{ - pmaddwd_r2r (mm2, mm4); // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 - punpckldq_r2r (mm5, mm5); // mm5 = x3 x1 x3 x1 - - pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x0-C2*x2 C4*x0-C6*x2 - punpckhdq_r2r (mm6, mm6); // mm6 = x7 x5 x7 x5 - - movq_m2r (*(table+12), mm7); // mm7 = -C5 -C1 C7 C5 - pmaddwd_r2r (mm5, mm1); // mm1 = C3*x1-C7*x3 C1*x1+C3*x3 - - paddd_m2r (*rounder, mm3); // mm3 += rounder - pmaddwd_r2r (mm6, mm7); // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 - - pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 - paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder - - pmaddwd_m2r (*(table+24), mm5); // mm5 = C7*x1-C5*x3 C5*x1-C1*x3 - movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder - - pmaddwd_m2r (*(table+28), mm6); // mm6 = C3*x5-C1*x7 C7*x5+C3*x7 - paddd_r2r (mm7, mm1); // mm1 = b1 b0 - - paddd_m2r (*rounder, mm0); // mm0 += rounder - psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder - - psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 - paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder - - paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder - psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 - - paddd_r2r (mm6, mm5); // mm5 = b3 b2 - movq_r2r (mm0, mm7); // mm7 = a3 a2 + rounder - - paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder - psubd_r2r (mm5, mm7); // mm7 = a3-b3 a2-b2 + rounder -} - -static inline void mmx_row_tail (int16_t * row, int store) -{ - psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 - - psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 - - packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 - - packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 - - movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 - movq_r2r (mm7, mm4); // mm4 = y6 y7 y4 y5 - - pslld_i2r (16, mm7); // mm7 = y7 0 y5 0 - - psrld_i2r (16, mm4); // mm4 = 0 y6 0 y4 - - por_r2r (mm4, mm7); // mm7 = y7 y6 y5 y4 - - /* slot */ - - movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 -} - -static inline void mmx_row_mid (int16_t * row, int store, - int offset, int16_t * table) -{ - movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 - psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 - - movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 - psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 - - packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 - movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 - - packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 - movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 - - movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 - movq_r2r (mm7, mm1); // mm1 = y6 y7 y4 y5 - - punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 - psrld_i2r (16, mm7); // mm7 = 0 y6 0 y4 - - movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 - pslld_i2r (16, mm1); // mm1 = y7 0 y5 0 - - movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 - por_r2r (mm1, mm7); // mm7 = y7 y6 y5 y4 - - movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 - punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 - - movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 - pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 -} - - -#if 0 -// C column IDCT - its just here to document the MMXEXT and MMX versions -static inline void idct_col (int16_t * col, int offset) -{ -/* multiplication - as implemented on mmx */ -#define F(c,x) (((c) * (x)) >> 16) - -/* saturation - it helps us handle torture test cases */ -#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) - - int16_t x0, x1, x2, x3, x4, x5, x6, x7; - int16_t y0, y1, y2, y3, y4, y5, y6, y7; - int16_t a0, a1, a2, a3, b0, b1, b2, b3; - int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; - - col += offset; - - x0 = col[0*8]; - x1 = col[1*8]; - x2 = col[2*8]; - x3 = col[3*8]; - x4 = col[4*8]; - x5 = col[5*8]; - x6 = col[6*8]; - x7 = col[7*8]; - - u04 = S (x0 + x4); - v04 = S (x0 - x4); - u26 = S (F (T2, x6) + x2); - v26 = S (F (T2, x2) - x6); - - a0 = S (u04 + u26); - a1 = S (v04 + v26); - a2 = S (v04 - v26); - a3 = S (u04 - u26); - - u17 = S (F (T1, x7) + x1); - v17 = S (F (T1, x1) - x7); - u35 = S (F (T3, x5) + x3); - v35 = S (F (T3, x3) - x5); - - b0 = S (u17 + u35); - b3 = S (v17 - v35); - u12 = S (u17 - u35); - v12 = S (v17 + v35); - u12 = S (2 * F (C4, u12)); - v12 = S (2 * F (C4, v12)); - b1 = S (u12 + v12); - b2 = S (u12 - v12); - - y0 = S (a0 + b0) >> COL_SHIFT; - y1 = S (a1 + b1) >> COL_SHIFT; - y2 = S (a2 + b2) >> COL_SHIFT; - y3 = S (a3 + b3) >> COL_SHIFT; - - y4 = S (a3 - b3) >> COL_SHIFT; - y5 = S (a2 - b2) >> COL_SHIFT; - y6 = S (a1 - b1) >> COL_SHIFT; - y7 = S (a0 - b0) >> COL_SHIFT; - - col[0*8] = y0; - col[1*8] = y1; - col[2*8] = y2; - col[3*8] = y3; - col[4*8] = y4; - col[5*8] = y5; - col[6*8] = y6; - col[7*8] = y7; -} -#endif - - -// MMX column IDCT -static inline void idct_col (int16_t * col, int offset) -{ -#define T1 13036 -#define T2 27146 -#define T3 43790 -#define C4 23170 - - static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; - static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; - static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; - static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; - - /* column code adapted from peter gubanov */ - /* http://www.elecard.com/peter/idct.shtml */ - - movq_m2r (*_T1, mm0); // mm0 = T1 - - movq_m2r (*(col+offset+1*8), mm1); // mm1 = x1 - movq_r2r (mm0, mm2); // mm2 = T1 - - movq_m2r (*(col+offset+7*8), mm4); // mm4 = x7 - pmulhw_r2r (mm1, mm0); // mm0 = T1*x1 - - movq_m2r (*_T3, mm5); // mm5 = T3 - pmulhw_r2r (mm4, mm2); // mm2 = T1*x7 - - movq_m2r (*(col+offset+5*8), mm6); // mm6 = x5 - movq_r2r (mm5, mm7); // mm7 = T3-1 - - movq_m2r (*(col+offset+3*8), mm3); // mm3 = x3 - psubsw_r2r (mm4, mm0); // mm0 = v17 - - movq_m2r (*_T2, mm4); // mm4 = T2 - pmulhw_r2r (mm3, mm5); // mm5 = (T3-1)*x3 - - paddsw_r2r (mm2, mm1); // mm1 = u17 - pmulhw_r2r (mm6, mm7); // mm7 = (T3-1)*x5 - - /* slot */ - - movq_r2r (mm4, mm2); // mm2 = T2 - paddsw_r2r (mm3, mm5); // mm5 = T3*x3 - - pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2 - paddsw_r2r (mm6, mm7); // mm7 = T3*x5 - - psubsw_r2r (mm6, mm5); // mm5 = v35 - paddsw_r2r (mm3, mm7); // mm7 = u35 - - movq_m2r (*(col+offset+6*8), mm3); // mm3 = x6 - movq_r2r (mm0, mm6); // mm6 = v17 - - pmulhw_r2r (mm3, mm2); // mm2 = T2*x6 - psubsw_r2r (mm5, mm0); // mm0 = b3 - - psubsw_r2r (mm3, mm4); // mm4 = v26 - paddsw_r2r (mm6, mm5); // mm5 = v12 - - movq_r2m (mm0, *(col+offset+3*8)); // save b3 in scratch0 - movq_r2r (mm1, mm6); // mm6 = u17 - - paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26 - paddsw_r2r (mm7, mm6); // mm6 = b0 - - psubsw_r2r (mm7, mm1); // mm1 = u12 - movq_r2r (mm1, mm7); // mm7 = u12 - - movq_m2r (*(col+offset+0*8), mm3); // mm3 = x0 - paddsw_r2r (mm5, mm1); // mm1 = u12+v12 - - movq_m2r (*_C4, mm0); // mm0 = C4/2 - psubsw_r2r (mm5, mm7); // mm7 = u12-v12 - - movq_r2m (mm6, *(col+offset+5*8)); // save b0 in scratch1 - pmulhw_r2r (mm0, mm1); // mm1 = b1/2 - - movq_r2r (mm4, mm6); // mm6 = v26 - pmulhw_r2r (mm0, mm7); // mm7 = b2/2 - - movq_m2r (*(col+offset+4*8), mm5); // mm5 = x4 - movq_r2r (mm3, mm0); // mm0 = x0 - - psubsw_r2r (mm5, mm3); // mm3 = v04 - paddsw_r2r (mm5, mm0); // mm0 = u04 - - paddsw_r2r (mm3, mm4); // mm4 = a1 - movq_r2r (mm0, mm5); // mm5 = u04 - - psubsw_r2r (mm6, mm3); // mm3 = a2 - paddsw_r2r (mm2, mm5); // mm5 = a0 - - paddsw_r2r (mm1, mm1); // mm1 = b1 - psubsw_r2r (mm2, mm0); // mm0 = a3 - - paddsw_r2r (mm7, mm7); // mm7 = b2 - movq_r2r (mm3, mm2); // mm2 = a2 - - movq_r2r (mm4, mm6); // mm6 = a1 - paddsw_r2r (mm7, mm3); // mm3 = a2+b2 - - psraw_i2r (COL_SHIFT, mm3); // mm3 = y2 - paddsw_r2r (mm1, mm4); // mm4 = a1+b1 - - psraw_i2r (COL_SHIFT, mm4); // mm4 = y1 - psubsw_r2r (mm1, mm6); // mm6 = a1-b1 - - movq_m2r (*(col+offset+5*8), mm1); // mm1 = b0 - psubsw_r2r (mm7, mm2); // mm2 = a2-b2 - - psraw_i2r (COL_SHIFT, mm6); // mm6 = y6 - movq_r2r (mm5, mm7); // mm7 = a0 - - movq_r2m (mm4, *(col+offset+1*8)); // save y1 - psraw_i2r (COL_SHIFT, mm2); // mm2 = y5 - - movq_r2m (mm3, *(col+offset+2*8)); // save y2 - paddsw_r2r (mm1, mm5); // mm5 = a0+b0 - - movq_m2r (*(col+offset+3*8), mm4); // mm4 = b3 - psubsw_r2r (mm1, mm7); // mm7 = a0-b0 - - psraw_i2r (COL_SHIFT, mm5); // mm5 = y0 - movq_r2r (mm0, mm3); // mm3 = a3 - - movq_r2m (mm2, *(col+offset+5*8)); // save y5 - psubsw_r2r (mm4, mm3); // mm3 = a3-b3 - - psraw_i2r (COL_SHIFT, mm7); // mm7 = y7 - paddsw_r2r (mm0, mm4); // mm4 = a3+b3 - - movq_r2m (mm5, *(col+offset+0*8)); // save y0 - psraw_i2r (COL_SHIFT, mm3); // mm3 = y4 - - movq_r2m (mm6, *(col+offset+6*8)); // save y6 - psraw_i2r (COL_SHIFT, mm4); // mm4 = y3 - - movq_r2m (mm7, *(col+offset+7*8)); // save y7 - - movq_r2m (mm3, *(col+offset+4*8)); // save y4 - - movq_r2m (mm4, *(col+offset+3*8)); // save y3 -} - - -static int32_t rounder0[] ATTR_ALIGN(8) = - rounder ((1 << (COL_SHIFT - 1)) - 0.5); -static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); -static int32_t rounder1[] ATTR_ALIGN(8) = - rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ -static int32_t rounder7[] ATTR_ALIGN(8) = - rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ -static int32_t rounder2[] ATTR_ALIGN(8) = - rounder (0.60355339059); /* C2 * (C6+C2)/2 */ -static int32_t rounder6[] ATTR_ALIGN(8) = - rounder (-0.25); /* C2 * (C6-C2)/2 */ -static int32_t rounder3[] ATTR_ALIGN(8) = - rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ -static int32_t rounder5[] ATTR_ALIGN(8) = - rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ - - -#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ -void idct (int16_t * block) \ -{ \ - static int16_t table04[] ATTR_ALIGN(16) = \ - table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ - static int16_t table17[] ATTR_ALIGN(16) = \ - table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ - static int16_t table26[] ATTR_ALIGN(16) = \ - table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ - static int16_t table35[] ATTR_ALIGN(16) = \ - table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ - \ - idct_row_head (block, 0*8, table04); \ - idct_row (table04, rounder0); \ - idct_row_mid (block, 0*8, 4*8, table04); \ - idct_row (table04, rounder4); \ - idct_row_mid (block, 4*8, 1*8, table17); \ - idct_row (table17, rounder1); \ - idct_row_mid (block, 1*8, 7*8, table17); \ - idct_row (table17, rounder7); \ - idct_row_mid (block, 7*8, 2*8, table26); \ - idct_row (table26, rounder2); \ - idct_row_mid (block, 2*8, 6*8, table26); \ - idct_row (table26, rounder6); \ - idct_row_mid (block, 6*8, 3*8, table35); \ - idct_row (table35, rounder3); \ - idct_row_mid (block, 3*8, 5*8, table35); \ - idct_row (table35, rounder5); \ - idct_row_tail (block, 5*8); \ - \ - idct_col (block, 0); \ - idct_col (block, 4); \ -} - - -declare_idct (ff_mmxext_idct, mmxext_table, - mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) - -declare_idct (ff_mmx_idct, mmx_table, - mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) diff --git a/src/libffmpeg/libavcodec/imgconvert.c b/src/libffmpeg/libavcodec/imgconvert.c new file mode 100644 index 000000000..d39b6c1e9 --- /dev/null +++ b/src/libffmpeg/libavcodec/imgconvert.c @@ -0,0 +1,548 @@ +/* + * Misc image convertion routines + * Copyright (c) 2001 Gerard Lantau. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include "avcodec.h" +#include "dsputil.h" + +#ifdef USE_FASTMEMCPY +#include "fastmemcpy.h" +#endif +/* XXX: totally non optimized */ + +static void yuv422_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, + UINT8 *src, int width, int height) +{ + int x, y; + UINT8 *p = src; + + for(y=0;y<height;y+=2) { + for(x=0;x<width;x+=2) { + lum[0] = p[0]; + cb[0] = p[1]; + lum[1] = p[2]; + cr[0] = p[3]; + p += 4; + lum += 2; + cb++; + cr++; + } + for(x=0;x<width;x+=2) { + lum[0] = p[0]; + lum[1] = p[2]; + p += 4; + lum += 2; + } + } +} + +#define SCALEBITS 8 +#define ONE_HALF (1 << (SCALEBITS - 1)) +#define FIX(x) ((int) ((x) * (1L<<SCALEBITS) + 0.5)) + +static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, + UINT8 *src, int width, int height) +{ + int wrap, wrap3, x, y; + int r, g, b, r1, g1, b1; + UINT8 *p; + + wrap = width; + wrap3 = width * 3; + p = src; + for(y=0;y<height;y+=2) { + for(x=0;x<width;x+=2) { + r = p[0]; + g = p[1]; + b = p[2]; + r1 = r; + g1 = g; + b1 = b; + lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; + r = p[3]; + g = p[4]; + b = p[5]; + r1 += r; + g1 += g; + b1 += b; + lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; + p += wrap3; + lum += wrap; + + r = p[0]; + g = p[1]; + b = p[2]; + r1 += r; + g1 += g; + b1 += b; + lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; + r = p[3]; + g = p[4]; + b = p[5]; + r1 += r; + g1 += g; + b1 += b; + lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; + + cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + + FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; + cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - + FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; + + cb++; + cr++; + p += -wrap3 + 2 * 3; + lum += -wrap + 2; + } + p += wrap3; + lum += wrap; + } +} + +static void bgr24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, + UINT8 *src, int width, int height) +{ + int wrap, wrap3, x, y; + int r, g, b, r1, g1, b1; + UINT8 *p; + + wrap = width; + wrap3 = width * 3; + p = src; + for(y=0;y<height;y+=2) { + for(x=0;x<width;x+=2) { + b = p[0]; + g = p[1]; + r = p[2]; + r1 = r; + g1 = g; + b1 = b; + lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; + b = p[3]; + g = p[4]; + r = p[5]; + r1 += r; + g1 += g; + b1 += b; + lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; + p += wrap3; + lum += wrap; + + b = p[0]; + g = p[1]; + r = p[2]; + r1 += r; + g1 += g; + b1 += b; + lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; + b = p[3]; + g = p[4]; + r = p[5]; + r1 += r; + g1 += g; + b1 += b; + lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; + + cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + + FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; + cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - + FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; + + cb++; + cr++; + p += -wrap3 + 2 * 3; + lum += -wrap + 2; + } + p += wrap3; + lum += wrap; + } +} + +/* XXX: use generic filter ? */ +/* 1x2 -> 1x1 */ +static void shrink2(UINT8 *dst, int dst_wrap, + UINT8 *src, int src_wrap, + int width, int height) +{ + int w; + UINT8 *s1, *s2, *d; + + for(;height > 0; height--) { + s1 = src; + s2 = s1 + src_wrap; + d = dst; + for(w = width;w >= 4; w-=4) { + d[0] = (s1[0] + s2[0]) >> 1; + d[1] = (s1[1] + s2[1]) >> 1; + d[2] = (s1[2] + s2[2]) >> 1; + d[3] = (s1[3] + s2[3]) >> 1; + s1 += 4; + s2 += 4; + d += 4; + } + for(;w > 0; w--) { + d[0] = (s1[0] + s2[0]) >> 1; + s1++; + s2++; + d++; + } + src += 2 * src_wrap; + dst += dst_wrap; + } +} + +/* 2x2 -> 1x1 */ +static void shrink22(UINT8 *dst, int dst_wrap, + UINT8 *src, int src_wrap, + int width, int height) +{ + int w; + UINT8 *s1, *s2, *d; + + for(;height > 0; height--) { + s1 = src; + s2 = s1 + src_wrap; + d = dst; + for(w = width;w >= 4; w-=4) { + d[0] = (s1[0] + s1[1] + s2[0] + s2[1] + 2) >> 1; + d[1] = (s1[2] + s1[3] + s2[2] + s2[3] + 2) >> 1; + d[2] = (s1[4] + s1[5] + s2[4] + s2[5] + 2) >> 1; + d[3] = (s1[6] + s1[7] + s2[6] + s2[7] + 2) >> 1; + s1 += 8; + s2 += 8; + d += 4; + } + for(;w > 0; w--) { + d[0] = (s1[0] + s1[1] + s2[0] + s2[1] + 2) >> 1; + s1 += 2; + s2 += 2; + d++; + } + src += 2 * src_wrap; + dst += dst_wrap; + } +} + +static void img_copy(UINT8 *dst, int dst_wrap, + UINT8 *src, int src_wrap, + int width, int height) +{ + for(;height > 0; height--) { + memcpy(dst, src, width); + dst += dst_wrap; + src += src_wrap; + } +} + +#define SCALE_BITS 10 + +#define C_Y (76309 >> (16 - SCALE_BITS)) +#define C_RV (117504 >> (16 - SCALE_BITS)) +#define C_BU (138453 >> (16 - SCALE_BITS)) +#define C_GU (13954 >> (16 - SCALE_BITS)) +#define C_GV (34903 >> (16 - SCALE_BITS)) + +#define RGBOUT(r, g, b, y1)\ +{\ + y = (y1 - 16) * C_Y;\ + r = cm[(y + r_add) >> SCALE_BITS];\ + g = cm[(y + g_add) >> SCALE_BITS];\ + b = cm[(y + b_add) >> SCALE_BITS];\ +} + +/* XXX: no chroma interpolating is done */ +static void yuv420p_to_rgb24(AVPicture *dst, AVPicture *src, + int width, int height) +{ + UINT8 *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2; + int w, y, cb, cr, r_add, g_add, b_add, width2; + UINT8 *cm = cropTbl + MAX_NEG_CROP; + + d = dst->data[0]; + y1_ptr = src->data[0]; + cb_ptr = src->data[1]; + cr_ptr = src->data[2]; + width2 = width >> 1; + for(;height > 0; height -= 2) { + d1 = d; + d2 = d + dst->linesize[0]; + y2_ptr = y1_ptr + src->linesize[0]; + for(w = width2; w > 0; w --) { + cb = cb_ptr[0] - 128; + cr = cr_ptr[0] - 128; + r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); + g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); + b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); + + /* output 4 pixels */ + RGBOUT(d1[0], d1[1], d1[2], y1_ptr[0]); + RGBOUT(d1[3], d1[4], d1[5], y1_ptr[1]); + RGBOUT(d2[0], d2[1], d2[2], y2_ptr[0]); + RGBOUT(d2[3], d2[4], d2[5], y2_ptr[1]); + + d1 += 6; + d2 += 6; + y1_ptr += 2; + y2_ptr += 2; + cb_ptr++; + cr_ptr++; + } + d += 2 * dst->linesize[0]; + y1_ptr += 2 * src->linesize[0] - width; + cb_ptr += src->linesize[1] - width2; + cr_ptr += src->linesize[2] - width2; + } +} + +/* XXX: no chroma interpolating is done */ +static void yuv422p_to_rgb24(AVPicture *dst, AVPicture *src, + int width, int height) +{ + UINT8 *y1_ptr, *cb_ptr, *cr_ptr, *d, *d1; + int w, y, cb, cr, r_add, g_add, b_add, width2; + UINT8 *cm = cropTbl + MAX_NEG_CROP; + + d = dst->data[0]; + y1_ptr = src->data[0]; + cb_ptr = src->data[1]; + cr_ptr = src->data[2]; + width2 = width >> 1; + for(;height > 0; height --) { + d1 = d; + for(w = width2; w > 0; w --) { + cb = cb_ptr[0] - 128; + cr = cr_ptr[0] - 128; + r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); + g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); + b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); + + /* output 2 pixels */ + RGBOUT(d1[0], d1[1], d1[2], y1_ptr[0]); + RGBOUT(d1[3], d1[4], d1[5], y1_ptr[1]); + + d1 += 6; + y1_ptr += 2; + cb_ptr++; + cr_ptr++; + } + d += dst->linesize[0]; + y1_ptr += src->linesize[0] - width; + cb_ptr += src->linesize[1] - width2; + cr_ptr += src->linesize[2] - width2; + } +} + +/* XXX: always use linesize. Return -1 if not supported */ +int img_convert(AVPicture *dst, int dst_pix_fmt, + AVPicture *src, int pix_fmt, + int width, int height) +{ + int i; + + if (dst_pix_fmt == pix_fmt) { + switch(pix_fmt) { + case PIX_FMT_YUV420P: + for(i=0;i<3;i++) { + if (i == 1) { + width >>= 1; + height >>= 1; + } + img_copy(dst->data[i], dst->linesize[i], + src->data[i], src->linesize[i], + width, height); + } + break; + default: + return -1; + } + } else if (dst_pix_fmt == PIX_FMT_YUV420P) { + + switch(pix_fmt) { + case PIX_FMT_YUV420P: + for(i=0;i<3;i++) { + img_copy(dst->data[i], dst->linesize[i], + src->data[i], src->linesize[i], + width, height); + } + break; + case PIX_FMT_YUV422P: + img_copy(dst->data[0], dst->linesize[0], + src->data[0], src->linesize[0], + width, height); + width >>= 1; + height >>= 1; + for(i=1;i<3;i++) { + shrink2(dst->data[i], dst->linesize[i], + src->data[i], src->linesize[i], + width, height); + } + break; + case PIX_FMT_YUV444P: + img_copy(dst->data[0], dst->linesize[0], + src->data[0], src->linesize[0], + width, height); + width >>= 1; + height >>= 1; + for(i=1;i<3;i++) { + shrink22(dst->data[i], dst->linesize[i], + src->data[i], src->linesize[i], + width, height); + } + break; + case PIX_FMT_YUV422: + yuv422_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], + src->data[0], width, height); + break; + case PIX_FMT_RGB24: + rgb24_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], + src->data[0], width, height); + break; + case PIX_FMT_BGR24: + bgr24_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], + src->data[0], width, height); + break; + default: + return -1; + } + } else if (dst_pix_fmt == PIX_FMT_RGB24) { + switch(pix_fmt) { + case PIX_FMT_YUV420P: + yuv420p_to_rgb24(dst, src, width, height); + break; + case PIX_FMT_YUV422P: + yuv422p_to_rgb24(dst, src, width, height); + break; + default: + return -1; + } + } else { + return -1; + } + return 0; +} + +/* filter parameters: [-1 4 2 4 -1] // 8 */ +static void deinterlace_line(UINT8 *dst, UINT8 *src, int src_wrap, + int size) +{ + UINT8 *cm = cropTbl + MAX_NEG_CROP; + int sum; + UINT8 *s; + + for(;size > 0;size--) { + s = src; + sum = -s[0]; + s += src_wrap; + sum += s[0] << 2; + s += src_wrap; + sum += s[0] << 1; + s += src_wrap; + sum += s[0] << 2; + s += src_wrap; + sum += -s[0]; + dst[0] = cm[(sum + 4) >> 3]; + dst++; + src++; + } +} + +/* deinterlacing : 2 temporal taps, 3 spatial taps linear filter. The + top field is copied as is, but the bottom field is deinterlaced + against the top field. */ +static void deinterlace_bottom_field(UINT8 *dst, int dst_wrap, + UINT8 *src1, int src_wrap, + int width, int height) +{ + UINT8 *src, *ptr; + int y, y1, i; + UINT8 *buf; + + buf= (UINT8*) malloc(5 * width); + + src = src1; + for(y=0;y<height;y+=2) { + /* copy top field line */ + memcpy(dst, src, width); + dst += dst_wrap; + src += (1 - 2) * src_wrap; + y1 = y - 2; + if (y1 >= 0 && (y1 + 4) < height) { + /* fast case : no edges */ + deinterlace_line(dst, src, src_wrap, width); + } else { + /* in order to use the same function, we use an intermediate buffer */ + ptr = buf; + for(i=0;i<5;i++) { + if (y1 < 0) + memcpy(ptr, src1, width); + else if (y1 >= height) + memcpy(ptr, src1 + (height - 1) * src_wrap, width); + else + memcpy(ptr, src1 + y1 * src_wrap, width); + y1++; + ptr += width; + } + deinterlace_line(dst, buf, width, width); + } + dst += dst_wrap; + src += (2 + 1) * src_wrap; + } + free(buf); +} + + +/* deinterlace, return -1 if format not handled */ +int avpicture_deinterlace(AVPicture *dst, AVPicture *src, + int pix_fmt, int width, int height) +{ + int i; + + if (pix_fmt != PIX_FMT_YUV420P && + pix_fmt != PIX_FMT_YUV422P && + pix_fmt != PIX_FMT_YUV444P) + return -1; + if ((width & 1) != 0 || (height & 3) != 0) + return -1; + + for(i=0;i<3;i++) { + if (i == 1) { + switch(pix_fmt) { + case PIX_FMT_YUV420P: + width >>= 1; + height >>= 1; + break; + case PIX_FMT_YUV422P: + width >>= 1; + break; + default: + break; + } + } + deinterlace_bottom_field(dst->data[i], dst->linesize[i], + src->data[i], src->linesize[i], + width, height); + } + return 0; +} diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c new file mode 100644 index 000000000..8c69de2de --- /dev/null +++ b/src/libffmpeg/libavcodec/imgresample.c @@ -0,0 +1,619 @@ +/* + * High quality image resampling with polyphase filters + * Copyright (c) 2001 Gerard Lantau. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include "dsputil.h" +#include "avcodec.h" + +#ifdef USE_FASTMEMCPY +#include "fastmemcpy.h" +#endif + + +#define NB_COMPONENTS 3 + +#define PHASE_BITS 4 +#define NB_PHASES (1 << PHASE_BITS) +#define NB_TAPS 4 +#define FCENTER 1 /* index of the center of the filter */ + +#define POS_FRAC_BITS 16 +#define POS_FRAC (1 << POS_FRAC_BITS) +/* 6 bits precision is needed for MMX */ +#define FILTER_BITS 8 + +#define LINE_BUF_HEIGHT (NB_TAPS * 4) + +struct ImgReSampleContext { + int iwidth, iheight, owidth, oheight; + int h_incr, v_incr; + INT16 h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */ + INT16 v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */ + UINT8 *line_buf; +}; + +static inline int get_phase(int pos) +{ + return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1); +} + +/* This function must be optimized */ +static void h_resample_fast(UINT8 *dst, int dst_width, UINT8 *src, int src_width, + int src_start, int src_incr, INT16 *filters) +{ + int src_pos, phase, sum, i; + UINT8 *s; + INT16 *filter; + + src_pos = src_start; + for(i=0;i<dst_width;i++) { +#ifdef TEST + /* test */ + if ((src_pos >> POS_FRAC_BITS) < 0 || + (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS)) + abort(); +#endif + s = src + (src_pos >> POS_FRAC_BITS); + phase = get_phase(src_pos); + filter = filters + phase * NB_TAPS; +#if NB_TAPS == 4 + sum = s[0] * filter[0] + + s[1] * filter[1] + + s[2] * filter[2] + + s[3] * filter[3]; +#else + { + int j; + sum = 0; + for(j=0;j<NB_TAPS;j++) + sum += s[j] * filter[j]; + } +#endif + sum = sum >> FILTER_BITS; + if (sum < 0) + sum = 0; + else if (sum > 255) + sum = 255; + dst[0] = sum; + src_pos += src_incr; + dst++; + } +} + +/* This function must be optimized */ +static void v_resample(UINT8 *dst, int dst_width, UINT8 *src, int wrap, + INT16 *filter) +{ + int sum, i; + UINT8 *s; + + s = src; + for(i=0;i<dst_width;i++) { +#if NB_TAPS == 4 + sum = s[0 * wrap] * filter[0] + + s[1 * wrap] * filter[1] + + s[2 * wrap] * filter[2] + + s[3 * wrap] * filter[3]; +#else + { + int j; + UINT8 *s1 = s; + + sum = 0; + for(j=0;j<NB_TAPS;j++) { + sum += s1[0] * filter[j]; + s1 += wrap; + } + } +#endif + sum = sum >> FILTER_BITS; + if (sum < 0) + sum = 0; + else if (sum > 255) + sum = 255; + dst[0] = sum; + dst++; + s++; + } +} + +#ifdef HAVE_MMX + +#include "i386/mmx.h" + +#define FILTER4(reg) \ +{\ + s = src + (src_pos >> POS_FRAC_BITS);\ + phase = get_phase(src_pos);\ + filter = filters + phase * NB_TAPS;\ + movq_m2r(*s, reg);\ + punpcklbw_r2r(mm7, reg);\ + movq_m2r(*filter, mm6);\ + pmaddwd_r2r(reg, mm6);\ + movq_r2r(mm6, reg);\ + psrlq_i2r(32, reg);\ + paddd_r2r(mm6, reg);\ + psrad_i2r(FILTER_BITS, reg);\ + src_pos += src_incr;\ +} + +#define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); + +/* XXX: do four pixels at a time */ +static void h_resample_fast4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int src_width, + int src_start, int src_incr, INT16 *filters) +{ + int src_pos, phase; + UINT8 *s; + INT16 *filter; + mmx_t tmp; + + src_pos = src_start; + pxor_r2r(mm7, mm7); + + while (dst_width >= 4) { + + FILTER4(mm0); + FILTER4(mm1); + FILTER4(mm2); + FILTER4(mm3); + + packuswb_r2r(mm7, mm0); + packuswb_r2r(mm7, mm1); + packuswb_r2r(mm7, mm3); + packuswb_r2r(mm7, mm2); + movq_r2m(mm0, tmp); + dst[0] = tmp.ub[0]; + movq_r2m(mm1, tmp); + dst[1] = tmp.ub[0]; + movq_r2m(mm2, tmp); + dst[2] = tmp.ub[0]; + movq_r2m(mm3, tmp); + dst[3] = tmp.ub[0]; + dst += 4; + dst_width -= 4; + } + while (dst_width > 0) { + FILTER4(mm0); + packuswb_r2r(mm7, mm0); + movq_r2m(mm0, tmp); + dst[0] = tmp.ub[0]; + dst++; + dst_width--; + } + emms(); +} + +static void v_resample4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int wrap, + INT16 *filter) +{ + int sum, i, v; + UINT8 *s; + mmx_t tmp; + mmx_t coefs[4]; + + for(i=0;i<4;i++) { + v = filter[i]; + coefs[i].uw[0] = v; + coefs[i].uw[1] = v; + coefs[i].uw[2] = v; + coefs[i].uw[3] = v; + } + + pxor_r2r(mm7, mm7); + s = src; + while (dst_width >= 4) { + movq_m2r(s[0 * wrap], mm0); + punpcklbw_r2r(mm7, mm0); + movq_m2r(s[1 * wrap], mm1); + punpcklbw_r2r(mm7, mm1); + movq_m2r(s[2 * wrap], mm2); + punpcklbw_r2r(mm7, mm2); + movq_m2r(s[3 * wrap], mm3); + punpcklbw_r2r(mm7, mm3); + + pmullw_m2r(coefs[0], mm0); + pmullw_m2r(coefs[1], mm1); + pmullw_m2r(coefs[2], mm2); + pmullw_m2r(coefs[3], mm3); + + paddw_r2r(mm1, mm0); + paddw_r2r(mm3, mm2); + paddw_r2r(mm2, mm0); + psraw_i2r(FILTER_BITS, mm0); + + packuswb_r2r(mm7, mm0); + movq_r2m(mm0, tmp); + + *(UINT32 *)dst = tmp.ud[0]; + dst += 4; + s += 4; + dst_width -= 4; + } + while (dst_width > 0) { + sum = s[0 * wrap] * filter[0] + + s[1 * wrap] * filter[1] + + s[2 * wrap] * filter[2] + + s[3 * wrap] * filter[3]; + sum = sum >> FILTER_BITS; + if (sum < 0) + sum = 0; + else if (sum > 255) + sum = 255; + dst[0] = sum; + dst++; + s++; + dst_width--; + } + emms(); +} +#endif + +/* slow version to handle limit cases. Does not need optimisation */ +static void h_resample_slow(UINT8 *dst, int dst_width, UINT8 *src, int src_width, + int src_start, int src_incr, INT16 *filters) +{ + int src_pos, phase, sum, j, v, i; + UINT8 *s, *src_end; + INT16 *filter; + + src_end = src + src_width; + src_pos = src_start; + for(i=0;i<dst_width;i++) { + s = src + (src_pos >> POS_FRAC_BITS); + phase = get_phase(src_pos); + filter = filters + phase * NB_TAPS; + sum = 0; + for(j=0;j<NB_TAPS;j++) { + if (s < src) + v = src[0]; + else if (s >= src_end) + v = src_end[-1]; + else + v = s[0]; + sum += v * filter[j]; + s++; + } + sum = sum >> FILTER_BITS; + if (sum < 0) + sum = 0; + else if (sum > 255) + sum = 255; + dst[0] = sum; + src_pos += src_incr; + dst++; + } +} + +static void h_resample(UINT8 *dst, int dst_width, UINT8 *src, int src_width, + int src_start, int src_incr, INT16 *filters) +{ + int n, src_end; + + if (src_start < 0) { + n = (0 - src_start + src_incr - 1) / src_incr; + h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters); + dst += n; + dst_width -= n; + src_start += n * src_incr; + } + src_end = src_start + dst_width * src_incr; + if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) { + n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) / + src_incr; + } else { + n = dst_width; + } +#ifdef HAVE_MMX + if ((mm_flags & MM_MMX) && NB_TAPS == 4) + h_resample_fast4_mmx(dst, n, + src, src_width, src_start, src_incr, filters); + else +#endif + h_resample_fast(dst, n, + src, src_width, src_start, src_incr, filters); + if (n < dst_width) { + dst += n; + dst_width -= n; + src_start += n * src_incr; + h_resample_slow(dst, dst_width, + src, src_width, src_start, src_incr, filters); + } +} + +static void component_resample(ImgReSampleContext *s, + UINT8 *output, int owrap, int owidth, int oheight, + UINT8 *input, int iwrap, int iwidth, int iheight) +{ + int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y; + UINT8 *new_line, *src_line; + + last_src_y = - FCENTER - 1; + /* position of the bottom of the filter in the source image */ + src_y = (last_src_y + NB_TAPS) * POS_FRAC; + ring_y = NB_TAPS; /* position in ring buffer */ + for(y=0;y<oheight;y++) { + /* apply horizontal filter on new lines from input if needed */ + src_y1 = src_y >> POS_FRAC_BITS; + while (last_src_y < src_y1) { + if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS) + ring_y = NB_TAPS; + last_src_y++; + /* handle limit conditions : replicate line (slighly + inefficient because we filter multiple times */ + y1 = last_src_y; + if (y1 < 0) { + y1 = 0; + } else if (y1 >= iheight) { + y1 = iheight - 1; + } + src_line = input + y1 * iwrap; + new_line = s->line_buf + ring_y * owidth; + /* apply filter and handle limit cases correctly */ + h_resample(new_line, owidth, + src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr, + &s->h_filters[0][0]); + /* handle ring buffer wraping */ + if (ring_y >= LINE_BUF_HEIGHT) { + memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth, + new_line, owidth); + } + } + /* apply vertical filter */ + phase_y = get_phase(src_y); +#ifdef HAVE_MMX + /* desactivated MMX because loss of precision */ + if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0) + v_resample4_mmx(output, owidth, + s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth, + &s->v_filters[phase_y][0]); + else +#endif + v_resample(output, owidth, + s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth, + &s->v_filters[phase_y][0]); + + src_y += s->v_incr; + output += owrap; + } +} + +/* XXX: the following filter is quite naive, but it seems to suffice + for 4 taps */ +static void build_filter(INT16 *filter, float factor) +{ + int ph, i, v; + float x, y, tab[NB_TAPS], norm, mult; + + /* if upsampling, only need to interpolate, no filter */ + if (factor > 1.0) + factor = 1.0; + + for(ph=0;ph<NB_PHASES;ph++) { + norm = 0; + for(i=0;i<NB_TAPS;i++) { + + x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor; + if (x == 0) + y = 1.0; + else + y = sin(x) / x; + tab[i] = y; + norm += y; + } + + /* normalize so that an uniform color remains the same */ + mult = (float)(1 << FILTER_BITS) / norm; + for(i=0;i<NB_TAPS;i++) { + v = (int)(tab[i] * mult); + filter[ph * NB_TAPS + i] = v; + } + } +} + +ImgReSampleContext *img_resample_init(int owidth, int oheight, + int iwidth, int iheight) +{ + ImgReSampleContext *s; + + s = av_mallocz(sizeof(ImgReSampleContext)); + if (!s) + return NULL; + s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS)); + if (!s->line_buf) + goto fail; + + s->owidth = owidth; + s->oheight = oheight; + s->iwidth = iwidth; + s->iheight = iheight; + + s->h_incr = (iwidth * POS_FRAC) / owidth; + s->v_incr = (iheight * POS_FRAC) / oheight; + + build_filter(&s->h_filters[0][0], (float)owidth / (float)iwidth); + build_filter(&s->v_filters[0][0], (float)oheight / (float)iheight); + + return s; + fail: + free(s); + return NULL; +} + +void img_resample(ImgReSampleContext *s, + AVPicture *output, AVPicture *input) +{ + int i, shift; + + for(i=0;i<3;i++) { + shift = (i == 0) ? 0 : 1; + component_resample(s, output->data[i], output->linesize[i], + s->owidth >> shift, s->oheight >> shift, + input->data[i], input->linesize[i], + s->iwidth >> shift, s->iheight >> shift); + } +} + +void img_resample_close(ImgReSampleContext *s) +{ + free(s->line_buf); + free(s); +} + +#ifdef TEST + +void *av_mallocz(int size) +{ + void *ptr; + ptr = malloc(size); + memset(ptr, 0, size); + return ptr; +} + +/* input */ +#define XSIZE 256 +#define YSIZE 256 +UINT8 img[XSIZE * YSIZE]; + +/* output */ +#define XSIZE1 512 +#define YSIZE1 512 +UINT8 img1[XSIZE1 * YSIZE1]; +UINT8 img2[XSIZE1 * YSIZE1]; + +void save_pgm(const char *filename, UINT8 *img, int xsize, int ysize) +{ + FILE *f; + f=fopen(filename,"w"); + fprintf(f,"P5\n%d %d\n%d\n", xsize, ysize, 255); + fwrite(img,1, xsize * ysize,f); + fclose(f); +} + +static void dump_filter(INT16 *filter) +{ + int i, ph; + + for(ph=0;ph<NB_PHASES;ph++) { + printf("%2d: ", ph); + for(i=0;i<NB_TAPS;i++) { + printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0); + } + printf("\n"); + } +} + +#ifdef HAVE_MMX +int mm_flags; +#endif + +int main(int argc, char **argv) +{ + int x, y, v, i, xsize, ysize; + ImgReSampleContext *s; + float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 }; + char buf[256]; + + /* build test image */ + for(y=0;y<YSIZE;y++) { + for(x=0;x<XSIZE;x++) { + if (x < XSIZE/2 && y < YSIZE/2) { + if (x < XSIZE/4 && y < YSIZE/4) { + if ((x % 10) <= 6 && + (y % 10) <= 6) + v = 0xff; + else + v = 0x00; + } else if (x < XSIZE/4) { + if (x & 1) + v = 0xff; + else + v = 0; + } else if (y < XSIZE/4) { + if (y & 1) + v = 0xff; + else + v = 0; + } else { + if (y < YSIZE*3/8) { + if ((y+x) & 1) + v = 0xff; + else + v = 0; + } else { + if (((x+3) % 4) <= 1 && + ((y+3) % 4) <= 1) + v = 0xff; + else + v = 0x00; + } + } + } else if (x < XSIZE/2) { + v = ((x - (XSIZE/2)) * 255) / (XSIZE/2); + } else if (y < XSIZE/2) { + v = ((y - (XSIZE/2)) * 255) / (XSIZE/2); + } else { + v = ((x + y - XSIZE) * 255) / XSIZE; + } + img[y * XSIZE + x] = v; + } + } + save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE); + for(i=0;i<sizeof(factors)/sizeof(float);i++) { + fact = factors[i]; + xsize = (int)(XSIZE * fact); + ysize = (int)(YSIZE * fact); + s = img_resample_init(xsize, ysize, XSIZE, YSIZE); + printf("Factor=%0.2f\n", fact); + dump_filter(&s->h_filters[0][0]); + component_resample(s, img1, xsize, xsize, ysize, + img, XSIZE, XSIZE, YSIZE); + img_resample_close(s); + + sprintf(buf, "/tmp/out%d.pgm", i); + save_pgm(buf, img1, xsize, ysize); + } + + /* mmx test */ +#ifdef HAVE_MMX + printf("MMX test\n"); + fact = 0.72; + xsize = (int)(XSIZE * fact); + ysize = (int)(YSIZE * fact); + mm_flags = MM_MMX; + s = img_resample_init(xsize, ysize, XSIZE, YSIZE); + component_resample(s, img1, xsize, xsize, ysize, + img, XSIZE, XSIZE, YSIZE); + + mm_flags = 0; + s = img_resample_init(xsize, ysize, XSIZE, YSIZE); + component_resample(s, img2, xsize, xsize, ysize, + img, XSIZE, XSIZE, YSIZE); + if (memcmp(img1, img2, xsize * ysize) != 0) { + fprintf(stderr, "mmx error\n"); + exit(1); + } + printf("MMX OK\n"); +#endif + return 0; +} + +#endif diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c index 66d018e69..08281a501 100644 --- a/src/libffmpeg/libavcodec/mjpeg.c +++ b/src/libffmpeg/libavcodec/mjpeg.c @@ -16,6 +16,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +//#define DEBUG #include "avcodec.h" #include "dsputil.h" #include "mpegvideo.h" @@ -419,30 +420,6 @@ void mjpeg_encode_mb(MpegEncContext *s, /******************************************/ /* decoding */ -//#define DEBUG - -#ifndef CONFIG_WIN32 - -#ifdef DEBUG -# if __GNUC__ -# define dprintf(fmt,args...) printf(fmt, ## args) -# else -# define dprintf(...) printf(__VA_ARGS__) -# endif -#else -# if __GNUC__ -# define dprintf(fmt,args...) -# else -# define dprintf(...) -# endif -#endif - -#else - -inline void dprintf(const char* fmt,...) {} - -#endif - /* compressed picture size */ #define PICTURE_BUFFER_SIZE 100000 diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c index 013e1b935..758124eea 100644 --- a/src/libffmpeg/libavcodec/mpeg12.c +++ b/src/libffmpeg/libavcodec/mpeg12.c @@ -16,36 +16,13 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +//#define DEBUG #include "avcodec.h" #include "dsputil.h" #include "mpegvideo.h" #include "mpeg12data.h" -//#define DEBUG - -#ifndef CONFIG_WIN32 - -#ifdef DEBUG -# if __GNUC__ -# define dprintf(fmt,args...) printf(fmt, ## args) -# else -# define dprintf(...) printf(__VA_ARGS__) -# endif -#else -# if __GNUC__ -# define dprintf(fmt,args...) -# else -# define dprintf(...) -# endif -#endif - -#else - -inline void dprintf(const char* fmt,...) {} - -#endif - /* Start codes. */ #define SEQ_END_CODE 0x000001b7 #define SEQ_START_CODE 0x000001b3 diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c index 9f7236f98..80a35635d 100644 --- a/src/libffmpeg/libavcodec/mpegvideo.c +++ b/src/libffmpeg/libavcodec/mpegvideo.c @@ -101,14 +101,12 @@ int MPV_common_init(MpegEncContext *s) int c_size, i; UINT8 *pict; - s->waiting_for_keyframe = 1; - if (s->out_format == FMT_H263) s->dct_unquantize = dct_unquantize_h263_c; else s->dct_unquantize = dct_unquantize_mpeg1_c; -#ifdef ARCH_X86 +#ifdef HAVE_MMX MPV_common_init_mmx(s); #endif s->mb_width = (s->width + 15) / 16; @@ -214,7 +212,6 @@ int MPV_common_init(MpegEncContext *s) if (s->aux_picture_base[i]) free(s->aux_picture_base[i]); } - return -1; } @@ -252,6 +249,9 @@ int MPV_encode_init(AVCodecContext *avctx) s->width = avctx->width; s->height = avctx->height; s->gop_size = avctx->gop_size; + s->rtp_mode = avctx->rtp_mode; + s->rtp_payload_size = avctx->rtp_payload_size; + if (s->gop_size <= 1) { s->intra_only = 1; s->gop_size = 12; @@ -273,14 +273,22 @@ int MPV_encode_init(AVCodecContext *avctx) return -1; break; case CODEC_ID_H263: - if (h263_get_picture_format(s->width, s->height) == 7) + if (h263_get_picture_format(s->width, s->height) == 7){ + printf("Input picture size isn't suitable for h263 codec! try h263+\n"); return -1; + } s->out_format = FMT_H263; break; case CODEC_ID_H263P: s->out_format = FMT_H263; + s->rtp_mode = 1; + s->rtp_payload_size = 1200; s->h263_plus = 1; - /* XXX: not unrectricted mv yet */ + s->unrestricted_mv = 1; + + /* These are just to be sure */ + s->umvplus = 0; + s->umvplus_dec = 0; break; case CODEC_ID_RV10: s->out_format = FMT_H263; @@ -395,9 +403,15 @@ void MPV_frame_end(MpegEncContext *s) { /* draw edge for correct motion prediction if outside */ if (s->pict_type != B_TYPE) { +#if 1 + draw_edges(s->current_picture[0], s->linesize, s->mb_width*16, s->mb_height*16, EDGE_WIDTH); + draw_edges(s->current_picture[1], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2); + draw_edges(s->current_picture[2], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2); +#else draw_edges(s->current_picture[0], s->linesize, s->width, s->height, EDGE_WIDTH); draw_edges(s->current_picture[1], s->linesize/2, s->width/2, s->height/2, EDGE_WIDTH/2); draw_edges(s->current_picture[2], s->linesize/2, s->width/2, s->height/2, EDGE_WIDTH/2); +#endif } } @@ -674,6 +688,11 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) mb_x = s->mb_x; mb_y = s->mb_y; +#ifdef FF_POSTPROCESS + quant_store[mb_y][mb_x]=s->qscale; + //printf("[%02d][%02d] %d\n",mb_x,mb_y,s->qscale); +#endif + /* update DC predictors for P macroblocks */ if (!s->mb_intra) { if (s->h263_pred) { @@ -813,7 +832,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) static void encode_picture(MpegEncContext *s, int picture_number) { - int mb_x, mb_y, wrap; + int mb_x, mb_y, wrap, last_gob; UINT8 *ptr; int i, motion_x, motion_y; @@ -863,7 +882,29 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->mv_type = MV_TYPE_16X16; s->mv_dir = MV_DIR_FORWARD; + /* Get the GOB height based on picture height */ + if (s->out_format == FMT_H263 && s->h263_plus) { + if (s->height <= 400) + s->gob_index = 1; + else if (s->height <= 800) + s->gob_index = 2; + else + s->gob_index = 4; + } + for(mb_y=0; mb_y < s->mb_height; mb_y++) { + /* Put GOB header based on RTP MTU */ + if (!mb_y) { + s->ptr_lastgob = s->pb.buf_ptr; + s->ptr_last_mb_line = s->pb.buf_ptr; + } else if (s->out_format == FMT_H263 && s->h263_plus) { + last_gob = h263_encode_gob_header(s, mb_y); + if (last_gob) { + //fprintf(stderr,"\nLast GOB size: %d", last_gob); + s->first_gob_line = 1; + } else + s->first_gob_line = 0; + } for(mb_x=0; mb_x < s->mb_width; mb_x++) { s->mb_x = mb_x; @@ -975,7 +1016,17 @@ static void encode_picture(MpegEncContext *s, int picture_number) MPV_decode_mb(s, s->block); } + /* Obtain average MB line size for RTP */ + if (!mb_y) + s->mb_line_avgsize = s->pb.buf_ptr - s->ptr_last_mb_line; + else + s->mb_line_avgsize = (s->mb_line_avgsize + s->pb.buf_ptr - s->ptr_last_mb_line) >> 1; + //fprintf(stderr, "\nMB line: %d\tSize: %u\tAvg. Size: %u", s->mb_y, + // (s->pb.buf_ptr - s->ptr_last_mb_line), s->mb_line_avgsize); + s->ptr_last_mb_line = s->pb.buf_ptr; } + //if (s->gob_number) + // fprintf(stderr,"\nNumber of GOB: %d", s->gob_number); } static int dct_quantize(MpegEncContext *s, diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h index 9abe75a7c..e653edb9b 100644 --- a/src/libffmpeg/libavcodec/mpegvideo.h +++ b/src/libffmpeg/libavcodec/mpegvideo.h @@ -128,7 +128,16 @@ typedef struct MpegEncContext { int P_frame_bits; /* same for P frame */ INT64 wanted_bits; INT64 total_bits; - + + /* H.263 specific */ + int gob_number; + int gob_index; + int first_gob_line; + + /* H.263+ specific */ + int umvplus; + int umvplus_dec; + /* mpeg4 specific */ int time_increment_bits; int shape; @@ -177,12 +186,17 @@ typedef struct MpegEncContext { int interlaced_dct; int last_qscale; int first_slice; - + + /* RTP specific */ + int rtp_mode; + int rtp_payload_size; + UINT8 *ptr_lastgob; + UINT8 *ptr_last_mb_line; + UINT32 mb_line_avgsize; + DCTELEM block[6][64] __align8; void (*dct_unquantize)(struct MpegEncContext *s, DCTELEM *block, int n, int qscale); - - int waiting_for_keyframe; } MpegEncContext; int MPV_common_init(MpegEncContext *s); @@ -190,7 +204,7 @@ void MPV_common_end(MpegEncContext *s); void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]); void MPV_frame_start(MpegEncContext *s); void MPV_frame_end(MpegEncContext *s); -#ifdef ARCH_X86 +#ifdef HAVE_MMX void MPV_common_init_mmx(MpegEncContext *s); #endif @@ -230,7 +244,7 @@ typedef struct RLTable { void init_rl(RLTable *rl); void init_vlc_rl(RLTable *rl); -static inline int get_rl_index(const RLTable *rl, int last, int run, int level) +extern inline int get_rl_index(const RLTable *rl, int last, int run, int level) { int index; index = rl->index_run[last][run]; @@ -245,6 +259,7 @@ void h263_encode_mb(MpegEncContext *s, DCTELEM block[6][64], int motion_x, int motion_y); void h263_encode_picture_header(MpegEncContext *s, int picture_number); +int h263_encode_gob_header(MpegEncContext * s, int mb_line); void h263_dc_scale(MpegEncContext *s); INT16 *h263_pred_motion(MpegEncContext * s, int block, int *px, int *py); @@ -255,6 +270,7 @@ void h263_encode_init_vlc(MpegEncContext *s); void h263_decode_init_vlc(MpegEncContext *s); int h263_decode_picture_header(MpegEncContext *s); +int h263_decode_gob_header(MpegEncContext *s); int mpeg4_decode_picture_header(MpegEncContext * s); int intel_h263_decode_picture_header(MpegEncContext *s); int h263_decode_mb(MpegEncContext *s, diff --git a/src/libffmpeg/libavcodec/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/mpegvideo_mmx.c deleted file mode 100644 index 7b3ba8aff..000000000 --- a/src/libffmpeg/libavcodec/mpegvideo_mmx.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * The simplest mpeg encoder (well, it was the simplest!) - * Copyright (c) 2000,2001 Gerard Lantau. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru> - */ - -#include "dsputil.h" -#include "mpegvideo.h" -#include "xineutils.h" - -#if 0 - -/* XXX: GL: I don't understand why this function needs optimization - (it is called only once per frame!), so I disabled it */ - -void MPV_frame_start(MpegEncContext *s) -{ - if (s->pict_type == B_TYPE) { - __asm __volatile( - "movl (%1), %%eax\n\t" - "movl 4(%1), %%edx\n\t" - "movl 8(%1), %%ecx\n\t" - "movl %%eax, (%0)\n\t" - "movl %%edx, 4(%0)\n\t" - "movl %%ecx, 8(%0)\n\t" - : - :"r"(s->current_picture), "r"(s->aux_picture) - :"eax","edx","ecx","memory"); - } else { - /* swap next and last */ - __asm __volatile( - "movl (%1), %%eax\n\t" - "movl 4(%1), %%edx\n\t" - "movl 8(%1), %%ecx\n\t" - "xchgl (%0), %%eax\n\t" - "xchgl 4(%0), %%edx\n\t" - "xchgl 8(%0), %%ecx\n\t" - "movl %%eax, (%1)\n\t" - "movl %%edx, 4(%1)\n\t" - "movl %%ecx, 8(%1)\n\t" - "movl %%eax, (%2)\n\t" - "movl %%edx, 4(%2)\n\t" - "movl %%ecx, 8(%2)\n\t" - : - :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture) - :"eax","edx","ecx","memory"); - } -} -#endif - -static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; -static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; - -/* - NK: - Note: looking at PARANOID: - "enable all paranoid tests for rounding, overflows, etc..." - -#ifdef PARANOID - if (level < -2048 || level > 2047) - fprintf(stderr, "unquant error %d %d\n", i, level); -#endif - We can suppose that result of two multiplications can't be greate of 0xFFFF - i.e. is 16-bit, so we use here only PMULLW instruction and can avoid - a complex multiplication. -===================================================== - Full formula for multiplication of 2 integer numbers - which are represent as high:low words: - input: value1 = high1:low1 - value2 = high2:low2 - output: value3 = value1*value2 - value3=high3:low3 (on overflow: modulus 2^32 wrap-around) - this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 - but this algorithm will compute only 0x66cb0ce4 - this limited by 16-bit size of operands - --------------------------------- - tlow1 = high1*low2 - tlow2 = high2*low1 - tlow1 = tlow1 + tlow2 - high3:low3 = low1*low2 - high3 += tlow1 -*/ -static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int i, level; - const UINT16 *quant_matrix; - if (s->mb_intra) { - if (n < 4) - block[0] = block[0] * s->y_dc_scale; - else - block[0] = block[0] * s->c_dc_scale; - if (s->out_format == FMT_H263) { - i = 1; - goto unquant_even; - } - /* XXX: only mpeg1 */ - quant_matrix = s->intra_matrix; - i=1; - /* Align on 4 elements boundary */ - while(i&3) - { - level = block[i]; - if (level) { - if (level < 0) level = -level; - level = (int)(level * qscale * quant_matrix[i]) >> 3; - level = (level - 1) | 1; - if (block[i] < 0) level = -level; - block[i] = level; - } - i++; - } - __asm __volatile( - "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */ - "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */ - "movq %2, %%mm4\n\t" - "movq %%mm6, %%mm7\n\t" - "movq %1, %%mm5\n\t" - "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */ - "pxor %%mm6, %%mm6\n\t" - ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory"); - for(;i<64;i+=4) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %%mm7, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" - "pcmpgtw %%mm6, %%mm2\n\t" - "pmullw %2, %%mm1\n\t" - "pandn %%mm4, %%mm2\n\t" - "por %%mm5, %%mm2\n\t" - "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */ - - "pcmpeqw %%mm6, %%mm3\n\t" - "pmullw %%mm0, %%mm1\n\t" - "psraw $3, %%mm1\n\t" - "psubw %%mm5, %%mm1\n\t" /* block[i] --; */ - "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */ - "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */ - "pmullw %%mm2, %%mm1\n\t" /* change signs again */ - - "pand %%mm3, %%mm1\n\t" /* nullify if was zero */ - "movq %%mm1, %0" - :"=m"(block[i]) - :"m"(block[i]), "m"(quant_matrix[i]) - :"memory"); - } - } else { - i = 0; - unquant_even: - quant_matrix = s->non_intra_matrix; - /* Align on 4 elements boundary */ - while(i&3) - { - level = block[i]; - if (level) { - if (level < 0) level = -level; - level = (((level << 1) + 1) * qscale * - ((int) quant_matrix[i])) >> 4; - level = (level - 1) | 1; - if(block[i] < 0) level = -level; - block[i] = level; - } - i++; - } - __asm __volatile( - "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */ - "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */ - "movq %2, %%mm4\n\t" - "movq %%mm6, %%mm7\n\t" - "movq %1, %%mm5\n\t" - "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */ - "pxor %%mm6, %%mm6\n\t" - ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory"); - for(;i<64;i+=4) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %%mm7, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" - "pcmpgtw %%mm6, %%mm2\n\t" - "pmullw %2, %%mm1\n\t" - "pandn %%mm4, %%mm2\n\t" - "por %%mm5, %%mm2\n\t" - "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */ - "psllw $1, %%mm0\n\t" /* block[i] <<= 1 */ - "paddw %%mm5, %%mm0\n\t" /* block[i] ++ */ - - "pmullw %%mm0, %%mm1\n\t" - "psraw $4, %%mm1\n\t" - "pcmpeqw %%mm6, %%mm3\n\t" - "psubw %%mm5, %%mm1\n\t" /* block[i] --; */ - "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */ - "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */ - "pmullw %%mm2, %%mm1\n\t" /* change signs again */ - - "pand %%mm3, %%mm1\n\t" /* nullify if was zero */ - "movq %%mm1, %0" - :"=m"(block[i]) - :"m"(block[i]), "m"(quant_matrix[i]) - :"memory"); - } - } -} - -void MPV_common_init_mmx(MpegEncContext *s) -{ - int mm_flags; - - mm_flags = xine_mm_accel(); - - if (mm_flags & MM_ACCEL_X86_MMX) { - /* XXX: should include h263 optimization too. It would go even - faster! */ - s->dct_unquantize = dct_unquantize_mpeg1_mmx; - } -} diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c index 63f111bdd..01e3d5438 100644 --- a/src/libffmpeg/libavcodec/msmpeg4.c +++ b/src/libffmpeg/libavcodec/msmpeg4.c @@ -30,7 +30,7 @@ * - (encoding) select best vlc/dc table * - (decoding) handle slice indication */ -#undef DEBUG +//#define DEBUG /* motion vector table */ typedef struct MVTable { @@ -664,7 +664,7 @@ int msmpeg4_decode_mb(MpegEncContext *s, /* special slice handling */ if (s->mb_x == 0) { - if ((s->mb_y % s->slice_height) == 0) { + if (s->slice_height && (s->mb_y % s->slice_height) == 0) { int wrap; /* reset DC pred (set previous line to 1024) */ wrap = 2 * s->mb_width + 2; diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c index 6d9376c54..f4ebc9016 100644 --- a/src/libffmpeg/libavcodec/rv10.c +++ b/src/libffmpeg/libavcodec/rv10.c @@ -24,7 +24,7 @@ #include "avcodec.h" #include "mpegvideo.h" -#undef DEBUG +//#define DEBUG static const UINT16 rv_lum_code[256] = { diff --git a/src/libffmpeg/libavcodec/sad_mmx.s b/src/libffmpeg/libavcodec/sad_mmx.s deleted file mode 100644 index 42c7ade59..000000000 --- a/src/libffmpeg/libavcodec/sad_mmx.s +++ /dev/null @@ -1,799 +0,0 @@ -# MMX/SSE optimized routines for SAD of 16*16 macroblocks -# Copyright (C) Juan J. Sierralta P. <juanjo@atmlab.utfsm.cl> -# -# dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net> -# Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk> - -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -# - -.global pix_abs16x16_mmx - -# int pix_abs16x16_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h); -# esi = p1 (init: blk1) -# edi = p2 (init: blk2) -# ecx = rowsleft (init: h) -# edx = lx; - -# mm0 = distance accumulators (4 words) -# mm1 = distance accumulators (4 words) -# mm2 = temp -# mm3 = temp -# mm4 = temp -# mm5 = temp -# mm6 = 0 -# mm7 = temp - - -.align 32 -pix_abs16x16_mmx: - pushl %ebp # save frame pointer - movl %esp,%ebp - - pushl %ebx # Saves registers (called saves convention in - pushl %ecx # x86 GCC it seems) - pushl %edx # - pushl %esi - pushl %edi - - pxor %mm0,%mm0 # zero acculumators - pxor %mm1,%mm1 - pxor %mm6,%mm6 - movl 8(%ebp),%esi # get pix1 - movl 12(%ebp),%edi # get pix2 - movl 16(%ebp),%edx # get lx - movl 20(%ebp),%ecx # get rowsleft - jmp pix_abs16x16_mmx.nextrow -.align 32 - -pix_abs16x16_mmx.nextrow: - # First 8 bytes of the row - - movq (%edi),%mm4 # load first 8 bytes of pix2 row - movq (%esi),%mm5 # load first 8 bytes of pix1 row - movq %mm4,%mm3 # mm4 := abs(mm4-mm5) - movq 8(%esi),%mm2 # load last 8 bytes of pix1 row - psubusb %mm5,%mm4 - movq 8(%edi),%mm7 # load last 8 bytes of pix2 row - psubusb %mm3,%mm5 - por %mm5,%mm4 - - # Last 8 bytes of the row - - movq %mm7,%mm3 # mm7 := abs(mm7-mm2) - psubusb %mm2,%mm7 - psubusb %mm3,%mm2 - por %mm2,%mm7 - - # Now mm4 and mm7 have 16 absdiffs to add - - # First 8 bytes of the row2 - - - addl %edx,%edi - movq (%edi),%mm2 # load first 8 bytes of pix2 row - addl %edx,%esi - movq (%esi),%mm5 # load first 8 bytes of pix1 row - - - - movq %mm2,%mm3 # mm2 := abs(mm2-mm5) - psubusb %mm5,%mm2 - movq 8(%esi),%mm6 # load last 8 bytes of pix1 row - psubusb %mm3,%mm5 - por %mm5,%mm2 - - # Last 8 bytes of the row2 - - movq 8(%edi),%mm5 # load last 8 bytes of pix2 row - - - movq %mm5,%mm3 # mm5 := abs(mm5-mm6) - psubusb %mm6,%mm5 - psubusb %mm3,%mm6 - por %mm6,%mm5 - - # Now mm2, mm4, mm5, mm7 have 32 absdiffs - - movq %mm7,%mm3 - - pxor %mm6,%mm6 # Zero mm6 - - punpcklbw %mm6,%mm3 # Unpack to words and add - punpckhbw %mm6,%mm7 - paddusw %mm3,%mm7 - - movq %mm5,%mm3 - - punpcklbw %mm6,%mm3 # Unpack to words and add - punpckhbw %mm6,%mm5 - paddusw %mm3,%mm5 - - paddusw %mm7,%mm0 # Add to the acumulator (mm0) - paddusw %mm5,%mm1 # Add to the acumulator (mm1) - - movq %mm4,%mm3 - - punpcklbw %mm6,%mm3 # Unpack to words and add - punpckhbw %mm6,%mm4 - movq %mm2,%mm5 - paddusw %mm3,%mm4 - - - - punpcklbw %mm6,%mm5 # Unpack to words and add - punpckhbw %mm6,%mm2 - paddusw %mm5,%mm2 - - # Loop termination - - addl %edx,%esi # update pointers to next row - paddusw %mm4,%mm0 # Add to the acumulator (mm0) - addl %edx,%edi - subl $2,%ecx - paddusw %mm2,%mm1 # Add to the acumulator (mm1) - testl %ecx,%ecx # check rowsleft - jnz pix_abs16x16_mmx.nextrow - - paddusw %mm1,%mm0 - movq %mm0,%mm2 # Copy mm0 to mm2 - psrlq $32,%mm2 - paddusw %mm2,%mm0 # Add - movq %mm0,%mm3 - psrlq $16,%mm3 - paddusw %mm3,%mm0 - movd %mm0,%eax # Store return value - andl $0xffff,%eax - - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - - popl %ebp # restore stack pointer - - #emms ; clear mmx registers - ret # return - -.global pix_abs16x16_sse - -# int pix_abs16x16_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h); -# esi = p1 (init: blk1) -# edi = p2 (init: blk2) -# ecx = rowsleft (init: h) -# edx = lx; - -# mm0 = distance accumulators (4 words) -# mm1 = distance accumulators (4 words) -# mm2 = temp -# mm3 = temp -# mm4 = temp -# mm5 = temp -# mm6 = temp -# mm7 = temp - - -.align 32 -pix_abs16x16_sse: - pushl %ebp # save frame pointer - movl %esp,%ebp - - pushl %ebx # Saves registers (called saves convention in - pushl %ecx # x86 GCC it seems) - pushl %edx # - pushl %esi - pushl %edi - - pxor %mm0,%mm0 # zero acculumators - pxor %mm1,%mm1 - movl 8(%ebp),%esi # get pix1 - movl 12(%ebp),%edi # get pix2 - movl 16(%ebp),%edx # get lx - movl 20(%ebp),%ecx # get rowsleft - jmp pix_abs16x16_sse.next4row -.align 32 - -pix_abs16x16_sse.next4row: - # First row - - movq (%edi),%mm4 # load first 8 bytes of pix2 row - movq 8(%edi),%mm5 # load last 8 bytes of pix2 row - psadbw (%esi),%mm4 # SAD of first 8 bytes - psadbw 8(%esi),%mm5 # SAD of last 8 bytes - paddw %mm4,%mm0 # Add to acumulators - paddw %mm5,%mm1 - - # Second row - - addl %edx,%edi # - addl %edx,%esi # - - movq (%edi),%mm6 # load first 8 bytes of pix2 row - movq 8(%edi),%mm7 # load last 8 bytes of pix2 row - psadbw (%esi),%mm6 # SAD of first 8 bytes - psadbw 8(%esi),%mm7 # SAD of last 8 bytes - paddw %mm6,%mm0 # Add to acumulators - paddw %mm7,%mm1 - - # Third row - - addl %edx,%edi # - addl %edx,%esi # - - movq (%edi),%mm4 # load first 8 bytes of pix2 row - movq 8(%edi),%mm5 # load last 8 bytes of pix2 row - psadbw (%esi),%mm4 # SAD of first 8 bytes - psadbw 8(%esi),%mm5 # SAD of last 8 bytes - paddw %mm4,%mm0 # Add to acumulators - paddw %mm5,%mm1 - - # Fourth row - - addl %edx,%edi # - addl %edx,%esi # - - movq (%edi),%mm6 # load first 8 bytes of pix2 row - movq 8(%edi),%mm7 # load last 8 bytes of pix2 row - psadbw (%esi),%mm6 # SAD of first 8 bytes - psadbw 8(%esi),%mm7 # SAD of last 8 bytes - paddw %mm6,%mm0 # Add to acumulators - paddw %mm7,%mm1 - - # Loop termination - - addl %edx,%esi # update pointers to next row - addl %edx,%edi - subl $4,%ecx - testl %ecx,%ecx # check rowsleft - jnz pix_abs16x16_sse.next4row - - paddd %mm1,%mm0 # Sum acumulators - movd %mm0,%eax # Store return value - - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - - popl %ebp # restore stack pointer - - #emms ; clear mmx registers - ret # return - -.global pix_abs16x16_x2_mmx - -# int pix_abs16x16_x2_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h); -# esi = p1 (init: blk1) -# edi = p2 (init: blk2) -# ecx = rowsleft (init: h) -# edx = lx; - -# mm0 = distance accumulators (4 words) -# mm1 = distance accumulators (4 words) -# mm2 = temp -# mm3 = temp -# mm4 = temp -# mm5 = temp -# mm6 = 0 -# mm7 = temp - - -.align 32 -pix_abs16x16_x2_mmx: - pushl %ebp # save frame pointer - movl %esp,%ebp - - pushl %ebx # Saves registers (called saves convention in - pushl %ecx # x86 GCC it seems) - pushl %edx # - pushl %esi - pushl %edi - - pxor %mm0,%mm0 # zero acculumators - pxor %mm1,%mm1 - pxor %mm6,%mm6 - movl 8(%ebp),%esi # get pix1 - movl 12(%ebp),%edi # get pix2 - movl 16(%ebp),%edx # get lx - movl 20(%ebp),%ecx # get rowsleft - jmp pix_abs16x16_x2_mmx.nextrow_x2 -.align 32 - -pix_abs16x16_x2_mmx.nextrow_x2: - # First 8 bytes of the row - - movq (%edi),%mm4 # load first 8 bytes of pix2 row - movq 1(%edi),%mm5 # load bytes 1-8 of pix2 row - - movq %mm4,%mm2 # copy mm4 on mm2 - movq %mm5,%mm3 # copy mm5 on mm3 - punpcklbw %mm6,%mm4 # first 4 bytes of [edi] on mm4 - punpcklbw %mm6,%mm5 # first 4 bytes of [edi+1] on mm5 - paddusw %mm5,%mm4 # mm4 := first 4 bytes interpolated in words - psrlw $1,%mm4 - - punpckhbw %mm6,%mm2 # last 4 bytes of [edi] on mm2 - punpckhbw %mm6,%mm3 # last 4 bytes of [edi+1] on mm3 - paddusw %mm3,%mm2 # mm2 := last 4 bytes interpolated in words - psrlw $1,%mm2 - - packuswb %mm2,%mm4 # pack 8 bytes interpolated on mm4 - movq (%esi),%mm5 # load first 8 bytes of pix1 row - - movq %mm4,%mm3 # mm4 := abs(mm4-mm5) - psubusb %mm5,%mm4 - psubusb %mm3,%mm5 - por %mm5,%mm4 - - # Last 8 bytes of the row - - movq 8(%edi),%mm7 # load last 8 bytes of pix2 row - movq 9(%edi),%mm5 # load bytes 10-17 of pix2 row - - movq %mm7,%mm2 # copy mm7 on mm2 - movq %mm5,%mm3 # copy mm5 on mm3 - punpcklbw %mm6,%mm7 # first 4 bytes of [edi+8] on mm7 - punpcklbw %mm6,%mm5 # first 4 bytes of [edi+9] on mm5 - paddusw %mm5,%mm7 # mm1 := first 4 bytes interpolated in words - psrlw $1,%mm7 - - punpckhbw %mm6,%mm2 # last 4 bytes of [edi] on mm2 - punpckhbw %mm6,%mm3 # last 4 bytes of [edi+1] on mm3 - paddusw %mm3,%mm2 # mm2 := last 4 bytes interpolated in words - psrlw $1,%mm2 - - packuswb %mm2,%mm7 # pack 8 bytes interpolated on mm1 - movq 8(%esi),%mm5 # load last 8 bytes of pix1 row - - movq %mm7,%mm3 # mm7 := abs(mm1-mm5) - psubusb %mm5,%mm7 - psubusb %mm3,%mm5 - por %mm5,%mm7 - - # Now mm4 and mm7 have 16 absdiffs to add - - movq %mm4,%mm3 # Make copies of these bytes - movq %mm7,%mm2 - - punpcklbw %mm6,%mm4 # Unpack to words and add - punpcklbw %mm6,%mm7 - paddusw %mm7,%mm4 - paddusw %mm4,%mm0 # Add to the acumulator (mm0) - - punpckhbw %mm6,%mm3 # Unpack to words and add - punpckhbw %mm6,%mm2 - paddusw %mm2,%mm3 - paddusw %mm3,%mm1 # Add to the acumulator (mm1) - - # Loop termination - - addl %edx,%esi # update pointers to next row - addl %edx,%edi - - subl $1,%ecx - testl %ecx,%ecx # check rowsleft - jnz pix_abs16x16_x2_mmx.nextrow_x2 - - paddusw %mm1,%mm0 - - movq %mm0,%mm1 # Copy mm0 to mm1 - psrlq $32,%mm1 - paddusw %mm1,%mm0 # Add - movq %mm0,%mm2 - psrlq $16,%mm2 - paddusw %mm2,%mm0 - movd %mm0,%eax # Store return value - andl $0xffff,%eax - - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - - popl %ebp # restore stack pointer - - emms # clear mmx registers - ret # return - -.global pix_abs16x16_y2_mmx - -# int pix_abs16x16_y2_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h); -# esi = p1 (init: blk1) -# edi = p2 (init: blk2) -# ebx = p2 + lx -# ecx = rowsleft (init: h) -# edx = lx; - -# mm0 = distance accumulators (4 words) -# mm1 = distance accumulators (4 words) -# mm2 = temp -# mm3 = temp -# mm4 = temp -# mm5 = temp -# mm6 = 0 -# mm7 = temp - - -.align 32 -pix_abs16x16_y2_mmx: - pushl %ebp # save frame pointer - movl %esp,%ebp - - pushl %ebx # Saves registers (called saves convention in - pushl %ecx # x86 GCC it seems) - pushl %edx # - pushl %esi - pushl %edi - - pxor %mm0,%mm0 # zero acculumators - pxor %mm1,%mm1 - pxor %mm6,%mm6 - movl 8(%ebp),%esi # get pix1 - movl 12(%ebp),%edi # get pix2 - movl 16(%ebp),%edx # get lx - movl 20(%ebp),%ecx # get rowsleft - movl %edi,%ebx - addl %edx,%ebx - jmp pix_abs16x16_y2_mmx.nextrow_y2 -.align 32 - -pix_abs16x16_y2_mmx.nextrow_y2: - # First 8 bytes of the row - - movq (%edi),%mm4 # load first 8 bytes of pix2 row - movq (%ebx),%mm5 # load bytes 1-8 of pix2 row - - movq %mm4,%mm2 # copy mm4 on mm2 - movq %mm5,%mm3 # copy mm5 on mm3 - punpcklbw %mm6,%mm4 # first 4 bytes of [edi] on mm4 - punpcklbw %mm6,%mm5 # first 4 bytes of [ebx] on mm5 - paddusw %mm5,%mm4 # mm4 := first 4 bytes interpolated in words - psrlw $1,%mm4 - - punpckhbw %mm6,%mm2 # last 4 bytes of [edi] on mm2 - punpckhbw %mm6,%mm3 # last 4 bytes of [edi+1] on mm3 - paddusw %mm3,%mm2 # mm2 := last 4 bytes interpolated in words - psrlw $1,%mm2 - - packuswb %mm2,%mm4 # pack 8 bytes interpolated on mm4 - movq (%esi),%mm5 # load first 8 bytes of pix1 row - - movq %mm4,%mm3 # mm4 := abs(mm4-mm5) - psubusb %mm5,%mm4 - psubusb %mm3,%mm5 - por %mm5,%mm4 - - # Last 8 bytes of the row - - movq 8(%edi),%mm7 # load last 8 bytes of pix2 row - movq 8(%ebx),%mm5 # load bytes 10-17 of pix2 row - - movq %mm7,%mm2 # copy mm7 on mm2 - movq %mm5,%mm3 # copy mm5 on mm3 - punpcklbw %mm6,%mm7 # first 4 bytes of [edi+8] on mm7 - punpcklbw %mm6,%mm5 # first 4 bytes of [ebx+8] on mm5 - paddusw %mm5,%mm7 # mm1 := first 4 bytes interpolated in words - psrlw $1,%mm7 - - punpckhbw %mm6,%mm2 # last 4 bytes of [edi+8] on mm2 - punpckhbw %mm6,%mm3 # last 4 bytes of [ebx+8] on mm3 - paddusw %mm3,%mm2 # mm2 := last 4 bytes interpolated in words - psrlw $1,%mm2 - - packuswb %mm2,%mm7 # pack 8 bytes interpolated on mm1 - movq 8(%esi),%mm5 # load last 8 bytes of pix1 row - - movq %mm7,%mm3 # mm7 := abs(mm1-mm5) - psubusb %mm5,%mm7 - psubusb %mm3,%mm5 - por %mm5,%mm7 - - # Now mm4 and mm7 have 16 absdiffs to add - - movq %mm4,%mm3 # Make copies of these bytes - movq %mm7,%mm2 - - punpcklbw %mm6,%mm4 # Unpack to words and add - punpcklbw %mm6,%mm7 - paddusw %mm7,%mm4 - paddusw %mm4,%mm0 # Add to the acumulator (mm0) - - punpckhbw %mm6,%mm3 # Unpack to words and add - punpckhbw %mm6,%mm2 - paddusw %mm2,%mm3 - paddusw %mm3,%mm1 # Add to the acumulator (mm1) - - # Loop termination - - addl %edx,%esi # update pointers to next row - addl %edx,%edi - addl %edx,%ebx - subl $1,%ecx - testl %ecx,%ecx # check rowsleft - jnz pix_abs16x16_y2_mmx.nextrow_y2 - - paddusw %mm1,%mm0 - - movq %mm0,%mm1 # Copy mm0 to mm1 - psrlq $32,%mm1 - paddusw %mm1,%mm0 # Add - movq %mm0,%mm2 - psrlq $16,%mm2 - paddusw %mm2,%mm0 - movd %mm0,%eax # Store return value - andl $0xffff,%eax - - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - - popl %ebp # restore stack pointer - - emms # clear mmx registers - ret # return - -.global pix_abs16x16_xy2_mmx - -# int pix_abs16x16_xy2_mmx(unsigned char *p1,unsigned char *p2,int lx,int h); - -# esi = p1 (init: blk1) -# edi = p2 (init: blk2) -# ebx = p1+lx -# ecx = rowsleft (init: h) -# edx = lx; - -# mm0 = distance accumulators (4 words) -# mm1 = bytes p2 -# mm2 = bytes p1 -# mm3 = bytes p1+lx -# I'd love to find someplace to stash p1+1 and p1+lx+1's bytes -# but I don't think thats going to happen in iA32-land... -# mm4 = temp 4 bytes in words interpolating p1, p1+1 -# mm5 = temp 4 bytes in words from p2 -# mm6 = temp comparison bit mask p1,p2 -# mm7 = temp comparison bit mask p2,p1 - - -.align 32 -pix_abs16x16_xy2_mmx: - pushl %ebp # save stack pointer - movl %esp,%ebp # so that we can do this - - pushl %ebx # Saves registers (called saves convention in - pushl %ecx # x86 GCC it seems) - pushl %edx # - pushl %esi - pushl %edi - - pxor %mm0,%mm0 # zero acculumators - - movl 12(%ebp),%esi # get p1 - movl 8(%ebp),%edi # get p2 - movl 16(%ebp),%edx # get lx - movl 20(%ebp),%ecx # rowsleft := h - movl %esi,%ebx - addl %edx,%ebx - jmp pix_abs16x16_xy2_mmx.nextrowmm11 # snap to it -.align 32 -pix_abs16x16_xy2_mmx.nextrowmm11: - - ## - ## First 8 bytes of row - ## - - ## First 4 bytes of 8 - - movq (%esi),%mm4 # mm4 := first 4 bytes p1 - pxor %mm7,%mm7 - movq %mm4,%mm2 # mm2 records all 8 bytes - punpcklbw %mm7,%mm4 # First 4 bytes p1 in Words... - - movq (%ebx),%mm6 # mm6 := first 4 bytes p1+lx - movq %mm6,%mm3 # mm3 records all 8 bytes - punpcklbw %mm7,%mm6 - paddw %mm6,%mm4 - - - movq 1(%esi),%mm5 # mm5 := first 4 bytes p1+1 - punpcklbw %mm7,%mm5 # First 4 bytes p1 in Words... - paddw %mm5,%mm4 - movq 1(%ebx),%mm6 # mm6 := first 4 bytes p1+lx+1 - punpcklbw %mm7,%mm6 - paddw %mm6,%mm4 - - psrlw $2,%mm4 # mm4 := First 4 bytes interpolated in words - - movq (%edi),%mm5 # mm5:=first 4 bytes of p2 in words - movq %mm5,%mm1 - punpcklbw %mm7,%mm5 - - movq %mm4,%mm7 - pcmpgtw %mm5,%mm7 # mm7 := [i : W0..3,mm4>mm5] - - movq %mm4,%mm6 # mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] - psubw %mm5,%mm6 - pand %mm7,%mm6 - - paddw %mm6,%mm0 # Add to accumulator - - movq %mm5,%mm6 # mm6 := [i : W0..3,mm5>mm4] - pcmpgtw %mm4,%mm6 - psubw %mm4,%mm5 # mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] - pand %mm6,%mm5 - - paddw %mm5,%mm0 # Add to accumulator - - ## Second 4 bytes of 8 - - movq %mm2,%mm4 # mm4 := Second 4 bytes p1 in words - pxor %mm7,%mm7 - punpckhbw %mm7,%mm4 - movq %mm3,%mm6 # mm6 := Second 4 bytes p1+1 in words - punpckhbw %mm7,%mm6 - paddw %mm6,%mm4 - - movq 1(%esi),%mm5 # mm5 := first 4 bytes p1+1 - punpckhbw %mm7,%mm5 # First 4 bytes p1 in Words... - paddw %mm5,%mm4 - movq 1(%ebx),%mm6 # mm6 := first 4 bytes p1+lx+1 - punpckhbw %mm7,%mm6 - paddw %mm6,%mm4 - - psrlw $2,%mm4 # mm4 := First 4 bytes interpolated in words - - movq %mm1,%mm5 # mm5:= second 4 bytes of p2 in words - punpckhbw %mm7,%mm5 - - movq %mm4,%mm7 - pcmpgtw %mm5,%mm7 # mm7 := [i : W0..3,mm4>mm5] - - movq %mm4,%mm6 # mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] - psubw %mm5,%mm6 - pand %mm7,%mm6 - - paddw %mm6,%mm0 # Add to accumulator - - movq %mm5,%mm6 # mm6 := [i : W0..3,mm5>mm4] - pcmpgtw %mm4,%mm6 - psubw %mm4,%mm5 # mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] - pand %mm6,%mm5 - - paddw %mm5,%mm0 # Add to accumulator - - - ## - ## Second 8 bytes of row - ## - ## First 4 bytes of 8 - - movq 8(%esi),%mm4 # mm4 := first 4 bytes p1+8 - pxor %mm7,%mm7 - movq %mm4,%mm2 # mm2 records all 8 bytes - punpcklbw %mm7,%mm4 # First 4 bytes p1 in Words... - - movq 8(%ebx),%mm6 # mm6 := first 4 bytes p1+lx+8 - movq %mm6,%mm3 # mm3 records all 8 bytes - punpcklbw %mm7,%mm6 - paddw %mm6,%mm4 - - - movq 9(%esi),%mm5 # mm5 := first 4 bytes p1+9 - punpcklbw %mm7,%mm5 # First 4 bytes p1 in Words... - paddw %mm5,%mm4 - movq 9(%ebx),%mm6 # mm6 := first 4 bytes p1+lx+9 - punpcklbw %mm7,%mm6 - paddw %mm6,%mm4 - - psrlw $2,%mm4 # mm4 := First 4 bytes interpolated in words - - movq 8(%edi),%mm5 # mm5:=first 4 bytes of p2+8 in words - movq %mm5,%mm1 - punpcklbw %mm7,%mm5 - - movq %mm4,%mm7 - pcmpgtw %mm5,%mm7 # mm7 := [i : W0..3,mm4>mm5] - - movq %mm4,%mm6 # mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] - psubw %mm5,%mm6 - pand %mm7,%mm6 - - paddw %mm6,%mm0 # Add to accumulator - - movq %mm5,%mm6 # mm6 := [i : W0..3,mm5>mm4] - pcmpgtw %mm4,%mm6 - psubw %mm4,%mm5 # mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] - pand %mm6,%mm5 - - paddw %mm5,%mm0 # Add to accumulator - - ## Second 4 bytes of 8 - - movq %mm2,%mm4 # mm4 := Second 4 bytes p1 in words - pxor %mm7,%mm7 - punpckhbw %mm7,%mm4 - movq %mm3,%mm6 # mm6 := Second 4 bytes p1+1 in words - punpckhbw %mm7,%mm6 - paddw %mm6,%mm4 - - movq 9(%esi),%mm5 # mm5 := first 4 bytes p1+1 - punpckhbw %mm7,%mm5 # First 4 bytes p1 in Words... - paddw %mm5,%mm4 - movq 9(%ebx),%mm6 # mm6 := first 4 bytes p1+lx+1 - punpckhbw %mm7,%mm6 - paddw %mm6,%mm4 - - psrlw $2,%mm4 # mm4 := First 4 bytes interpolated in words - - movq %mm1,%mm5 # mm5:= second 4 bytes of p2 in words - punpckhbw %mm7,%mm5 - - movq %mm4,%mm7 - pcmpgtw %mm5,%mm7 # mm7 := [i : W0..3,mm4>mm5] - - movq %mm4,%mm6 # mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] - psubw %mm5,%mm6 - pand %mm7,%mm6 - - paddw %mm6,%mm0 # Add to accumulator - - movq %mm5,%mm6 # mm6 := [i : W0..3,mm5>mm4] - pcmpgtw %mm4,%mm6 - psubw %mm4,%mm5 # mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] - pand %mm6,%mm5 - - paddw %mm5,%mm0 # Add to accumulator - - - ## - ## Loop termination condition... and stepping - ## - - addl %edx,%esi # update pointer to next row - addl %edx,%edi # ditto - addl %edx,%ebx - - subl $1,%ecx - testl %ecx,%ecx # check rowsleft - jnz pix_abs16x16_xy2_mmx.nextrowmm11 - - ## Sum the Accumulators - movq %mm0,%mm4 - psrlq $32,%mm4 - paddw %mm4,%mm0 - movq %mm0,%mm6 - psrlq $16,%mm6 - paddw %mm6,%mm0 - movd %mm0,%eax # store return value - andl $0xffff,%eax - - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - - popl %ebp # restore stack pointer - - emms # clear mmx registers - ret # we now return you to your regular programming - - - diff --git a/src/libffmpeg/libavcodec/simple_idct.c b/src/libffmpeg/libavcodec/simple_idct.c new file mode 100644 index 000000000..5459b81b3 --- /dev/null +++ b/src/libffmpeg/libavcodec/simple_idct.c @@ -0,0 +1,231 @@ +/* + Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +/* + based upon some outcommented c code from mpeg2dec (idct_mmx.c written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) +*/ + +#include <inttypes.h> + +#include "simple_idct.h" + +#if 0 +#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ +#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ +#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ +#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ +#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */ +#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */ +#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ +#define ROW_SHIFT 8 +#define COL_SHIFT 17 +#else +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define ROW_SHIFT 11 +#define COL_SHIFT 20 // 6 +#endif +#if 1 +static void inline idctRow (int16_t * row) +{ + int a0, a1, a2, a3, b0, b1, b2, b3; + const int C1 =W1; + const int C2 =W2; + const int C3 =W3; + const int C4 =W4; + const int C5 =W5; + const int C6 =W6; + const int C7 =W7; + + if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7])) { + row[0] = row[1] = row[2] = row[3] = row[4] = + row[5] = row[6] = row[7] = row[0]<<3; + return; + } + + a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1)); + a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1)); + a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1)); + a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1)); + + b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; + b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; + b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; + b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; + + row[0] = (a0 + b0) >> ROW_SHIFT; + row[1] = (a1 + b1) >> ROW_SHIFT; + row[2] = (a2 + b2) >> ROW_SHIFT; + row[3] = (a3 + b3) >> ROW_SHIFT; + row[4] = (a3 - b3) >> ROW_SHIFT; + row[5] = (a2 - b2) >> ROW_SHIFT; + row[6] = (a1 - b1) >> ROW_SHIFT; + row[7] = (a0 - b0) >> ROW_SHIFT; +} + +static void inline idctCol (int16_t * col) +{ + int a0, a1, a2, a3, b0, b1, b2, b3; + const int C1 =W1; + const int C2 =W2; + const int C3 =W3; + const int C4 =W4; + const int C5 =W5; + const int C6 =W6; + const int C7 =W7; +/* + if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { + col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = + col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; + return; + }*/ + col[0] += (1<<(COL_SHIFT-1))/W4; + a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6]; + a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6]; + a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6]; + a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6]; + + b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7]; + b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7]; + b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7]; + b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7]; + + col[8*0] = (a0 + b0) >> COL_SHIFT; + col[8*1] = (a1 + b1) >> COL_SHIFT; + col[8*2] = (a2 + b2) >> COL_SHIFT; + col[8*3] = (a3 + b3) >> COL_SHIFT; + col[8*4] = (a3 - b3) >> COL_SHIFT; + col[8*5] = (a2 - b2) >> COL_SHIFT; + col[8*6] = (a1 - b1) >> COL_SHIFT; + col[8*7] = (a0 - b0) >> COL_SHIFT; +} + +void simple_idct (short *block) +{ + int i; + for(i=0; i<8; i++) + idctRow(block + 8*i); + + for(i=0; i<8; i++) + idctCol(block + i); + +} + +#else + +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define COL_SHIFT 31 // 6 + +static void inline idctRow (int32_t *out, int16_t * row) +{ + int a0, a1, a2, a3, b0, b1, b2, b3; + const int C1 =W1; + const int C2 =W2; + const int C3 =W3; + const int C4 =W4; + const int C5 =W5; + const int C6 =W6; + const int C7 =W7; +/* + if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7])) { + row[0] = row[1] = row[2] = row[3] = row[4] = + row[5] = row[6] = row[7] = row[0]<<14; + return; + } +*/ + a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6]; + a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6]; + a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6]; + a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6]; + + b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; + b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; + b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; + b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; + + out[0] = (a0 + b0); + out[1] = (a1 + b1); + out[2] = (a2 + b2); + out[3] = (a3 + b3); + out[4] = (a3 - b3); + out[5] = (a2 - b2); + out[6] = (a1 - b1); + out[7] = (a0 - b0); +} + +static void inline idctCol (int32_t *in, int16_t * col) +{ + int64_t a0, a1, a2, a3, b0, b1, b2, b3; + const int64_t C1 =W1; + const int64_t C2 =W2; + const int64_t C3 =W3; + const int64_t C4 =W4; + const int64_t C5 =W5; + const int64_t C6 =W6; + const int64_t C7 =W7; +/* + if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { + col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = + col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; + return; + }*/ + in[0] += (1<<(COL_SHIFT-1))/W4; + a0 = C4*in[8*0] + C2*in[8*2] + C4*in[8*4] + C6*in[8*6]; + a1 = C4*in[8*0] + C6*in[8*2] - C4*in[8*4] - C2*in[8*6]; + a2 = C4*in[8*0] - C6*in[8*2] - C4*in[8*4] + C2*in[8*6]; + a3 = C4*in[8*0] - C2*in[8*2] + C4*in[8*4] - C6*in[8*6]; + + b0 = C1*in[8*1] + C3*in[8*3] + C5*in[8*5] + C7*in[8*7]; + b1 = C3*in[8*1] - C7*in[8*3] - C1*in[8*5] - C5*in[8*7]; + b2 = C5*in[8*1] - C1*in[8*3] + C7*in[8*5] + C3*in[8*7]; + b3 = C7*in[8*1] - C5*in[8*3] + C3*in[8*5] - C1*in[8*7]; + + col[8*0] = (a0 + b0) >> COL_SHIFT; + col[8*1] = (a1 + b1) >> COL_SHIFT; + col[8*2] = (a2 + b2) >> COL_SHIFT; + col[8*3] = (a3 + b3) >> COL_SHIFT; + col[8*4] = (a3 - b3) >> COL_SHIFT; + col[8*5] = (a2 - b2) >> COL_SHIFT; + col[8*6] = (a1 - b1) >> COL_SHIFT; + col[8*7] = (a0 - b0) >> COL_SHIFT; +} + +void simple_idct (short *block) +{ + int i; + int32_t temp[64]; + for(i=0; i<8; i++) + idctRow(temp+8*i, block + 8*i); + + for(i=0; i<8; i++) + idctCol(temp+i, block + i); + +} + +#endif diff --git a/src/libffmpeg/libavcodec/simple_idct.h b/src/libffmpeg/libavcodec/simple_idct.h new file mode 100644 index 000000000..54dff7396 --- /dev/null +++ b/src/libffmpeg/libavcodec/simple_idct.h @@ -0,0 +1,20 @@ +/* + Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +void simple_idct(short *block); +void simple_idct_mmx(short *block); diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c index 26510d87c..aef27e342 100644 --- a/src/libffmpeg/libavcodec/utils.c +++ b/src/libffmpeg/libavcodec/utils.c @@ -16,19 +16,36 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <stdlib.h> #include <stdio.h> #include <string.h> #include <errno.h> #include "common.h" #include "dsputil.h" #include "avcodec.h" +#ifdef HAVE_MALLOC_H +#include <malloc.h> +#else +#include <stdlib.h> +#endif /* memory alloc */ void *av_mallocz(int size) { void *ptr; +#if defined ( ARCH_X86 ) && defined ( HAVE_MEMALIGN ) + ptr = memalign(64,size); + /* Why 64? + Indeed, we should align it: + on 4 for 386 + on 16 for 486 + on 32 for 586, PPro - k6-III + on 64 for K7 (maybe for P3 too). + Because L1 and L2 caches are aligned on those values. + But I don't want to code such logic here! + */ +#else ptr = malloc(size); +#endif if (!ptr) return NULL; memset(ptr, 0, size); @@ -138,6 +155,18 @@ AVCodec *avcodec_find_encoder(enum CodecID id) return NULL; } +AVCodec *avcodec_find_encoder_by_name(const char *name) +{ + AVCodec *p; + p = first_avcodec; + while (p) { + if (p->encode != NULL && strcmp(name,p->name) == 0) + return p; + p = p->next; + } + return NULL; +} + AVCodec *avcodec_find_decoder(enum CodecID id) { AVCodec *p; @@ -188,6 +217,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) const char *codec_name; AVCodec *p; char buf1[32]; + int bitrate; if (encode) p = avcodec_find_encoder(enc->codec_id); @@ -228,6 +258,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) enc->width, enc->height, (float)enc->frame_rate / FRAME_RATE_BASE); } + bitrate = enc->bit_rate; break; case CODEC_TYPE_AUDIO: snprintf(buf, buf_size, @@ -239,13 +270,31 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) enc->sample_rate, enc->channels == 2 ? "stereo" : "mono"); } + /* for PCM codecs, compute bitrate directly */ + switch(enc->codec_id) { + case CODEC_ID_PCM_S16LE: + case CODEC_ID_PCM_S16BE: + case CODEC_ID_PCM_U16LE: + case CODEC_ID_PCM_U16BE: + bitrate = enc->sample_rate * enc->channels * 16; + break; + case CODEC_ID_PCM_S8: + case CODEC_ID_PCM_U8: + case CODEC_ID_PCM_ALAW: + case CODEC_ID_PCM_MULAW: + bitrate = enc->sample_rate * enc->channels * 8; + break; + default: + bitrate = enc->bit_rate; + break; + } break; default: abort(); } - if (enc->bit_rate != 0) { + if (bitrate != 0) { snprintf(buf + strlen(buf), buf_size - strlen(buf), - ", %d kb/s", enc->bit_rate / 1000); + ", %d kb/s", bitrate / 1000); } } @@ -353,7 +402,6 @@ void avcodec_register_all(void) register_avcodec(&mpeg4_encoder); register_avcodec(&msmpeg4_encoder); #endif /* CONFIG_ENCODERS */ - register_avcodec(&pcm_codec); register_avcodec(&rawvideo_codec); /* decoders */ @@ -365,13 +413,32 @@ void avcodec_register_all(void) register_avcodec(&h263i_decoder); register_avcodec(&rv10_decoder); register_avcodec(&mjpeg_decoder); -#ifdef CONFIG_MPGLIB +#ifdef FF_AUDIO_CODECS register_avcodec(&mp3_decoder); -#endif #ifdef CONFIG_AC3 register_avcodec(&ac3_decoder); #endif +#endif #endif /* CONFIG_DECODERS */ + +#ifdef FF_AUDIO_CODECS + /* pcm codecs */ + +#define PCM_CODEC(id, name) \ + register_avcodec(& name ## _encoder); \ + register_avcodec(& name ## _decoder); \ + +PCM_CODEC(CODEC_ID_PCM_S16LE, pcm_s16le); +PCM_CODEC(CODEC_ID_PCM_S16BE, pcm_s16be); +PCM_CODEC(CODEC_ID_PCM_U16LE, pcm_u16le); +PCM_CODEC(CODEC_ID_PCM_U16BE, pcm_u16be); +PCM_CODEC(CODEC_ID_PCM_S8, pcm_s8); +PCM_CODEC(CODEC_ID_PCM_U8, pcm_u8); +PCM_CODEC(CODEC_ID_PCM_ALAW, pcm_alaw); +PCM_CODEC(CODEC_ID_PCM_MULAW, pcm_mulaw); + +#undef PCM_CODEC +#endif } static int encode_init(AVCodecContext *s) @@ -392,18 +459,6 @@ static int encode_frame(AVCodecContext *avctx, return -1; } -/* dummy pcm codec */ -AVCodec pcm_codec = { - "pcm", - CODEC_TYPE_AUDIO, - CODEC_ID_PCM, - 0, - encode_init, - encode_frame, - NULL, - decode_frame, -}; - AVCodec rawvideo_codec = { "rawvideo", CODEC_TYPE_VIDEO, |