diff options
Diffstat (limited to 'src')
41 files changed, 9784 insertions, 5276 deletions
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am index f05501807..0fcae49fb 100644 --- a/src/libffmpeg/libavcodec/Makefile.am +++ b/src/libffmpeg/libavcodec/Makefile.am @@ -16,7 +16,8 @@ noinst_LTLIBRARIES = libavcodec.la libavcodec_la_SOURCES = common.c utils.c mpegvideo.c h263.c jrevdct.c jfdctfst.c \ mjpeg.c dsputil.c \ motion_est.c imgconvert.c msmpeg4.c \ - mpeg12.c h263dec.c rv10.c simple_idct.c + mpeg12.c h263dec.c rv10.c simple_idct.c \ + ratecontrol.c #imgresample.c libavcodec_la_LDFLAGS = \ diff --git a/src/libffmpeg/libavcodec/alpha/asm.h b/src/libffmpeg/libavcodec/alpha/asm.h new file mode 100644 index 000000000..0f4685f11 --- /dev/null +++ b/src/libffmpeg/libavcodec/alpha/asm.h @@ -0,0 +1,141 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner <falk@debian.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LIBAVCODEC_ALPHA_ASM_H +#define LIBAVCODEC_ALPHA_ASM_H + +#include <stdint.h> + +#define AMASK_BWX (1 << 0) +#define AMASK_FIX (1 << 1) +#define AMASK_MVI (1 << 8) + +static inline uint64_t BYTE_VEC(uint64_t x) +{ + x |= x << 8; + x |= x << 16; + x |= x << 32; + return x; +} +static inline uint64_t WORD_VEC(uint64_t x) +{ + x |= x << 16; + x |= x << 32; + return x; +} + +static inline int32_t ldl(const void* p) +{ + return *(const int32_t*) p; +} +static inline uint64_t ldq(const void* p) +{ + return *(const uint64_t*) p; +} +/* FIXME ccc doesn't seem to get it? Use inline asm? */ +static inline uint64_t ldq_u(const void* p) +{ + return *(const uint64_t*) ((uintptr_t) p & ~7ul); +} +static inline void stl(uint32_t l, void* p) +{ + *(uint32_t*) p = l; +} +static inline void stq(uint64_t l, void* p) +{ + *(uint64_t*) p = l; +} + +#ifdef __GNUC__ +#define OPCODE1(name) \ +static inline uint64_t name(uint64_t l) \ +{ \ + uint64_t r; \ + asm (#name " %1, %0" : "=r" (r) : "r" (l)); \ + return r; \ +} + +#define OPCODE2(name) \ +static inline uint64_t name(uint64_t l1, uint64_t l2) \ +{ \ + uint64_t r; \ + asm (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2)); \ + return r; \ +} + +/* We don't want gcc to move this around or combine it with another + rpcc, so mark it volatile. 
*/ +static inline uint64_t rpcc(void) +{ + uint64_t r; + asm volatile ("rpcc %0" : "=r" (r)); + return r; +} + +static inline uint64_t uldq(const void* v) +{ + struct foo { + unsigned long l; + } __attribute__((packed)); + + return ((const struct foo*) v)->l; +} + +#elif defined(__DECC) /* Compaq "ccc" compiler */ + +#include <c_asm.h> +#define OPCODE1(name) \ +static inline uint64_t name(uint64_t l) \ +{ \ + return asm (#name " %a0, %v0", l); \ +} + +#define OPCODE2(name) \ +static inline uint64_t name(uint64_t l1, uint64_t l2) \ +{ \ + return asm (#name " %a0, %a1, %v0", l1, l2); \ +} + +static inline uint64_t rpcc(void) +{ + return asm ("rpcc %v0"); +} + +static inline uint64_t uldq(const void* v) +{ + return *(const __unaligned uint64_t *) v; +} + +#endif + +OPCODE1(amask); +OPCODE1(unpkbw); +OPCODE1(pkwb); +OPCODE2(extql); +OPCODE2(extqh); +OPCODE2(zap); +OPCODE2(cmpbge); +OPCODE2(minsw4); +OPCODE2(minuw4); +OPCODE2(minub8); +OPCODE2(maxsw4); +OPCODE2(maxuw4); +OPCODE2(perr); + +#endif /* LIBAVCODEC_ALPHA_ASM_H */ diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c new file mode 100644 index 000000000..3a54904f4 --- /dev/null +++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c @@ -0,0 +1,223 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner <falk@debian.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "asm.h" +#include "../dsputil.h" + +void simple_idct_axp(DCTELEM *block); + +static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, + int line_size) +{ + int i = 8; + do { + UINT64 shorts; + + shorts = ldq(block); + shorts = maxsw4(shorts, 0); + shorts = minsw4(shorts, WORD_VEC(0x00ff)); + stl(pkwb(shorts), pixels); + + shorts = ldq(block + 4); + shorts = maxsw4(shorts, 0); + shorts = minsw4(shorts, WORD_VEC(0x00ff)); + stl(pkwb(shorts), pixels + 4); + + pixels += line_size; + block += 8; + } while (--i); +} + +static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, + int line_size) +{ + int i = 8; + do { + UINT64 shorts; + + shorts = ldq(block); + shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */ + shorts += unpkbw(ldl(pixels)); + shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */ + shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */ + shorts &= ~WORD_VEC(0x4000); /* ...and zap them */ + shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */ + stl(pkwb(shorts), pixels); + + /* next 4 */ + shorts = ldq(block + 4); + shorts &= ~WORD_VEC(0x8000); + shorts += unpkbw(ldl(pixels + 4)); + shorts &= ~WORD_VEC(0x8000); + shorts = minuw4(shorts, WORD_VEC(0x4000)); + shorts &= ~WORD_VEC(0x4000); + shorts = minsw4(shorts, WORD_VEC(0x00ff)); + stl(pkwb(shorts), pixels + 4); + + pixels += line_size; + block += 8; + } while (--i); +} + +/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1 + Since the immediate result could be greater than 255, we do the + shift first. The result is too low by one if the bytes were both + odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). 
*/ +static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2) +{ + UINT64 correction = (l1 & l2) & BYTE_VEC(0x01); + l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; + l2 = (l2 & ~BYTE_VEC(0x01)) >> 1; + return l1 + l2 + correction; +} + +/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1 + The '1' only has an effect when one byte is even and the other odd, + i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01). + Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */ +static inline UINT64 avg2(UINT64 l1, UINT64 l2) +{ + UINT64 correction = (l1 | l2) & BYTE_VEC(0x01); + l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; + l2 = (l2 & ~BYTE_VEC(0x01)) >> 1; + return l1 + l2 + correction; +} + +static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) +{ + UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) + + ((l2 & ~BYTE_VEC(0x03)) >> 2) + + ((l3 & ~BYTE_VEC(0x03)) >> 2) + + ((l4 & ~BYTE_VEC(0x03)) >> 2); + UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) + + (l2 & BYTE_VEC(0x03)) + + (l3 & BYTE_VEC(0x03)) + + (l4 & BYTE_VEC(0x03)) + + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); + return r1 + r2; +} + +static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) +{ + UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) + + ((l2 & ~BYTE_VEC(0x03)) >> 2) + + ((l3 & ~BYTE_VEC(0x03)) >> 2) + + ((l4 & ~BYTE_VEC(0x03)) >> 2); + UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) + + (l2 & BYTE_VEC(0x03)) + + (l3 & BYTE_VEC(0x03)) + + (l4 & BYTE_VEC(0x03)) + + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03); + return r1 + r2; +} + +#define PIXOPNAME(suffix) put ## suffix +#define BTYPE UINT8 +#define AVG2 avg2 +#define AVG4 avg4 +#define STORE(l, b) stq(l, b) +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +#define PIXOPNAME(suffix) put_no_rnd ## suffix +#define BTYPE UINT8 +#define AVG2 avg2_no_rnd +#define AVG4 avg4_no_rnd +#define STORE(l, b) stq(l, b) +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +/* The following functions 
are untested. */ +#if 0 + +#define PIXOPNAME(suffix) avg ## suffix +#define BTYPE UINT8 +#define AVG2 avg2 +#define AVG4 avg4 +#define STORE(l, b) stq(AVG2(l, ldq(b)), b); +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +#define PIXOPNAME(suffix) avg_no_rnd ## suffix +#define BTYPE UINT8 +#define AVG2 avg2_no_rnd +#define AVG4 avg4_no_rnd +#define STORE(l, b) stq(AVG2(l, ldq(b)), b); +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +#define PIXOPNAME(suffix) sub ## suffix +#define BTYPE DCTELEM +#define AVG2 avg2 +#define AVG4 avg4 +#define STORE(l, block) do { \ + UINT64 xxx = l; \ + (block)[0] -= (xxx >> 0) & 0xff; \ + (block)[1] -= (xxx >> 8) & 0xff; \ + (block)[2] -= (xxx >> 16) & 0xff; \ + (block)[3] -= (xxx >> 24) & 0xff; \ + (block)[4] -= (xxx >> 32) & 0xff; \ + (block)[5] -= (xxx >> 40) & 0xff; \ + (block)[6] -= (xxx >> 48) & 0xff; \ + (block)[7] -= (xxx >> 56) & 0xff; \ +} while (0) +#include "pixops.h" +#undef PIXOPNAME +#undef BTYPE +#undef AVG2 +#undef AVG4 +#undef STORE + +#endif + +void dsputil_init_alpha(void) +{ + put_pixels_tab[0] = put_pixels_axp; + put_pixels_tab[1] = put_pixels_x2_axp; + put_pixels_tab[2] = put_pixels_y2_axp; + put_pixels_tab[3] = put_pixels_xy2_axp; + + put_no_rnd_pixels_tab[0] = put_pixels_axp; + put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp; + put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp; + put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp; + + /* amask clears all bits that correspond to present features. 
*/ + if (amask(AMASK_MVI) == 0) { + fprintf(stderr, "MVI extension detected\n"); + put_pixels_clamped = put_pixels_clamped_axp; + add_pixels_clamped = add_pixels_clamped_axp; + } +} diff --git a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c new file mode 100644 index 000000000..d0af5e1d3 --- /dev/null +++ b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c @@ -0,0 +1,88 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner <falk@debian.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "asm.h" +#include "../dsputil.h" +#include "../mpegvideo.h" + +extern UINT8 zigzag_end[64]; + +static void dct_unquantize_h263_axp(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int i, level; + UINT64 qmul, qadd; + if (s->mb_intra) { + if (n < 4) + block[0] = block[0] * s->y_dc_scale; + else + block[0] = block[0] * s->c_dc_scale; + /* Catch up to aligned point. 
*/ + qmul = s->qscale << 1; + qadd = (s->qscale - 1) | 1; + for (i = 1; i < 4; ++i) { + level = block[i]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[i] = level; + } + } + block += 4; + i = 60 / 4; + } else { + i = zigzag_end[s->block_last_index[n]] / 4; + } + qmul = s->qscale << 1; + qadd = WORD_VEC((qscale - 1) | 1); + do { + UINT64 levels, negmask, zeromask, corr; + levels = ldq(block); + if (levels == 0) + continue; + zeromask = cmpbge(0, levels); + zeromask &= zeromask >> 1; + /* Negate all negative words. */ + negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */ + negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */ + corr = negmask & WORD_VEC(0x0001); /* twos-complement correction */ + levels ^= negmask; + levels += corr; + + levels = levels * qmul; + levels += zap(qadd, zeromask); + + /* Re-negate negative words. */ + levels -= corr; + levels ^= negmask; + + stq(levels, block); + } while (block += 4, --i); +} + +void MPV_common_init_axp(MpegEncContext *s) +{ + if (amask(AMASK_MVI) == 0) { + if (s->out_format == FMT_H263) + s->dct_unquantize = dct_unquantize_h263_axp; + } +} diff --git a/src/libffmpeg/libavcodec/alpha/pixops.h b/src/libffmpeg/libavcodec/alpha/pixops.h new file mode 100644 index 000000000..118d7ae23 --- /dev/null +++ b/src/libffmpeg/libavcodec/alpha/pixops.h @@ -0,0 +1,135 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner <falk@debian.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* This file is intended to be #included with proper definitions of + * PIXOPNAME, BTYPE, AVG2, AVG4 and STORE. */ + +static void PIXOPNAME(_pixels_axp)(BTYPE *block, const UINT8 *pixels, + int line_size, int h) +{ + if ((size_t) pixels & 0x7) { + do { + STORE(uldq(pixels), block); + pixels += line_size; + block += line_size; + } while (--h); + } else { + do { + STORE(ldq(pixels), block); + pixels += line_size; + block += line_size; + } while (--h); + } +} + +static void PIXOPNAME(_pixels_x2_axp)(BTYPE *block, const UINT8 *pixels, + int line_size, int h) +{ + if ((size_t) pixels & 0x7) { + do { + UINT64 pix1, pix2; + + pix1 = uldq(pixels); + pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); + STORE(AVG2(pix1, pix2), block); + pixels += line_size; + block += line_size; + } while (--h); + } else { + do { + UINT64 pix1, pix2; + + pix1 = ldq(pixels); + pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); + STORE(AVG2(pix1, pix2), block); + pixels += line_size; + block += line_size; + } while (--h); + } +} + +static void PIXOPNAME(_pixels_y2_axp)(BTYPE *block, const UINT8 *pixels, + int line_size, int h) +{ + if ((size_t) pixels & 0x7) { + UINT64 pix = uldq(pixels); + do { + UINT64 next_pix; + + pixels += line_size; + next_pix = uldq(pixels); + STORE(AVG2(pix, next_pix), block); + block += line_size; + pix = next_pix; + } while (--h); + } else { + UINT64 pix = ldq(pixels); + do { + UINT64 next_pix; + + pixels += line_size; + next_pix = ldq(pixels); + STORE(AVG2(pix, next_pix), block); + block += line_size; + pix = next_pix; + } while (--h); + } +} + +/* This could be further sped up by recycling AVG4 intermediate + results from the previous loop pass. 
*/ +static void PIXOPNAME(_pixels_xy2_axp)(BTYPE *block, const UINT8 *pixels, + int line_size, int h) +{ + if ((size_t) pixels & 0x7) { + UINT64 pix1 = uldq(pixels); + UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); + + do { + UINT64 next_pix1, next_pix2; + + pixels += line_size; + next_pix1 = uldq(pixels); + next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56); + + STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block); + + block += line_size; + pix1 = next_pix1; + pix2 = next_pix2; + } while (--h); + } else { + UINT64 pix1 = ldq(pixels); + UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); + + do { + UINT64 next_pix1, next_pix2; + + pixels += line_size; + next_pix1 = ldq(pixels); + next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56); + + STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block); + + block += line_size; + pix1 = next_pix1; + pix2 = next_pix2; + } while (--h); + } +} diff --git a/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c b/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c index 1cf7b4fba..cd362ca48 100644 --- a/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c +++ b/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c @@ -2,19 +2,19 @@ * ARMv4L optimized DSP utils * Copyright (c) 2001 Lionel Ulmer. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
* - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "../dsputil.h" diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h index 05b27d8c2..68b67154d 100644 --- a/src/libffmpeg/libavcodec/avcodec.h +++ b/src/libffmpeg/libavcodec/avcodec.h @@ -3,6 +3,11 @@ #include "common.h" +#define LIBAVCODEC_VERSION_INT 0x000406 +#define LIBAVCODEC_VERSION "0.4.6" +#define LIBAVCODEC_BUILD 4614 +#define LIBAVCODEC_BUILD_STR "4614" + enum CodecID { CODEC_ID_NONE, CODEC_ID_MPEG1VIDEO, @@ -17,18 +22,31 @@ enum CodecID { CODEC_ID_MSMPEG4V1, CODEC_ID_MSMPEG4V2, CODEC_ID_MSMPEG4V3, + CODEC_ID_WMV1, + CODEC_ID_WMV2, CODEC_ID_H263P, CODEC_ID_H263I, + /* various pcm "codecs" */ + CODEC_ID_PCM_S16LE, + CODEC_ID_PCM_S16BE, + CODEC_ID_PCM_U16LE, + CODEC_ID_PCM_U16BE, + CODEC_ID_PCM_S8, + CODEC_ID_PCM_U8, + CODEC_ID_PCM_MULAW, + CODEC_ID_PCM_ALAW, }; #define CODEC_ID_MSMPEG4 CODEC_ID_MSMPEG4V3 enum CodecType { + CODEC_TYPE_UNKNOWN = -1, CODEC_TYPE_VIDEO, CODEC_TYPE_AUDIO, }; enum PixelFormat { + PIX_FMT_ANY = -1, PIX_FMT_YUV420P, PIX_FMT_YUV422, PIX_FMT_RGB24, @@ -45,14 +63,24 @@ enum SampleFormat { /* in bytes */ #define AVCODEC_MAX_AUDIO_FRAME_SIZE 18432 -/* 
motion estimation type */ +/* motion estimation type, EPZS by default */ +enum Motion_Est_ID { + ME_ZERO = 1, + ME_FULL, + ME_LOG, + ME_PHODS, + ME_EPZS, + ME_X1 +}; + +/* only for ME compatibility with old apps */ extern int motion_estimation_method; -#define ME_ZERO 0 -#define ME_FULL 1 -#define ME_LOG 2 -#define ME_PHODS 3 -#define ME_EPZS 4 -#define ME_X1 5 + +/* ME algos sorted by quality */ +static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG, + ME_X1, ME_EPZS, ME_FULL }; + +#define FF_MAX_B_FRAMES 4 /* encoding support */ /* note not everything is supported yet */ @@ -60,10 +88,17 @@ extern int motion_estimation_method; #define CODEC_FLAG_HQ 0x0001 /* high quality (non real time) encoding */ #define CODEC_FLAG_QSCALE 0x0002 /* use fixed qscale */ #define CODEC_FLAG_4MV 0x0004 /* 4 MV per MB allowed */ -#define CODEC_FLAG_B 0x0008 /* use B frames */ #define CODEC_FLAG_QPEL 0x0010 /* use qpel MC */ #define CODEC_FLAG_GMC 0x0020 /* use GMC */ #define CODEC_FLAG_TYPE 0x0040 /* fixed I/P frame type, from avctx->key_frame */ +#define CODEC_FLAG_PART 0x0080 /* use data partitioning */ +/* parent program guarantees that the input for b-frame containing streams is not written to + for at least s->max_b_frames+1 frames, if this is not set then the input will be copied */ +#define CODEC_FLAG_INPUT_PRESERVED 0x0100 +#define CODEC_FLAG_PASS1 0x0200 /* use internal 2pass ratecontrol in first pass mode */ +#define CODEC_FLAG_PASS2 0x0400 /* use internal 2pass ratecontrol in second pass mode */ +#define CODEC_FLAG_EXTERN_HUFF 0x1000 /* use external huffman table (for mjpeg) */ +#define CODEC_FLAG_GRAY 0x2000 /* only decode/encode grayscale */ /* codec capabilities */ @@ -78,6 +113,15 @@ typedef struct AVCodecContext { int flags; int sub_id; /* some codecs needs additionnal format info. It is stored there */ + + int me_method; /* ME algorithm used for video coding */ + + /* extra data from parent application to codec, e.g. 
huffman table + for mjpeg */ + /* the parent should allocate and free this buffer */ + void *extradata; + int extradata_size; + /* video only */ int frame_rate; /* frames per sec multiplied by FRAME_RATE_BASE */ int width, height; @@ -88,8 +132,10 @@ typedef struct AVCodecContext { #define FF_ASPECT_16_9_625 4 #define FF_ASPECT_16_9_525 5 int gop_size; /* 0 = intra only */ - int pix_fmt; /* pixel format, see PIX_FMT_xxx */ - + enum PixelFormat pix_fmt; /* pixel format, see PIX_FMT_xxx */ + int repeat_pict; /* when decoding, this signal how much the picture */ + /* must be delayed. */ + /* extra_delay = (repeat_pict / 2) * (1/fps) */ /* if non NULL, 'draw_horiz_band' is called by the libavcodec decoder to draw an horizontal band. It improve cache usage. Not all codecs can do that. You must check the codec capabilities @@ -104,23 +150,48 @@ typedef struct AVCodecContext { int sample_fmt; /* sample format, currenly unused */ /* the following data should not be initialized */ - int frame_size; /* in samples, initialized when calling 'init' */ - int frame_number; /* audio or video frame number */ - int key_frame; /* true if the previous compressed frame was - a key frame (intra, or seekable) */ + int frame_size; /* in samples, initialized when calling 'init' */ + int frame_number; /* audio or video frame number */ + int real_pict_num; /* returns the real picture number of + previous encoded frame */ + int key_frame; /* true if the previous compressed frame was + a key frame (intra, or seekable) */ + int pict_type; /* picture type of the previous + encoded frame */ +/* FIXME: these should have FF_ */ +#define I_TYPE 1 // Intra +#define P_TYPE 2 // Predicted +#define B_TYPE 3 // Bi-dir predicted +#define S_TYPE 4 // S(GMC)-VOP MPEG4 + + int delay; /* number of frames the decoded output + will be delayed relative to the encoded input */ + uint8_t *mbskip_table; /* =1 if MB didnt change, is only valid for I/P frames + stride= mb_width = (width+15)>>4 */ + + /* encoding 
parameters */ int quality; /* quality of the previous encoded frame - (between 1 (good) and 31 (bad)) */ + (between 1 (good) and 31 (bad)) + this is allso used to set the quality in vbr mode + and the per frame quality in CODEC_FLAG_TYPE (second pass mode) */ float qcompress; /* amount of qscale change between easy & hard scenes (0.0-1.0)*/ float qblur; /* amount of qscale smoothing over time (0.0-1.0) */ int qmin; /* min qscale */ int qmax; /* max qscale */ int max_qdiff; /* max qscale difference between frames */ + int max_b_frames; /* maximum b frames, the output will be delayed by max_b_frames+1 relative to the input */ + float b_quant_factor;/* qscale factor between ips and b frames */ + int rc_strategy; + int b_frame_strategy; + + int hurry_up; /* when set to 1 during decoding, b frames will be skiped + when set to 2 idct/dequant will be skipped too */ struct AVCodec *codec; void *priv_data; /* The following data is for RTP friendly coding */ - /* By now only H.263/H.263+ coder honours this */ + /* By now only H.263/H.263+/MPEG4 coder honours this */ int rtp_mode; /* 1 for activate RTP friendly-mode */ /* highers numbers represent more error-prone */ /* enviroments, by now just "1" exist */ @@ -145,7 +216,7 @@ typedef struct AVCodecContext { float psnr_y; float psnr_cb; float psnr_cr; - + /* statistics, used for 2-pass encoding */ int mv_bits; int header_bits; @@ -156,13 +227,57 @@ typedef struct AVCodecContext { int skip_count; int misc_bits; // cbp, mb_type int frame_bits; - + /* the following fields are ignored */ void *opaque; /* can be used to carry app specific stuff */ char codec_name[32]; - int codec_type; /* see CODEC_TYPE_xxx */ - int codec_id; /* see CODEC_ID_xxx */ + enum CodecType codec_type; /* see CODEC_TYPE_xxx */ + enum CodecID codec_id; /* see CODEC_ID_xxx */ unsigned int codec_tag; /* codec tag, only used if unknown codec */ + + int workaround_bugs; /* workaround bugs in encoders which cannot be detected automatically */ + int 
luma_elim_threshold; + int chroma_elim_threshold; + int strict_std_compliance; /* strictly follow the std (MPEG4, ...) */ + float b_quant_offset;/* qscale offset between ips and b frames, not implemented yet */ + int error_resilience; + +#ifndef MBC +#define MBC 128 +#define MBR 96 +#endif + int *quant_store; /* field for communicating with external postprocessing */ + unsigned qstride; + //FIXME this should be reordered after kabis API is finished ... + /* + Note: Below are located reserved fields for further usage + It requires for ABI !!! + If you'll perform some changes then borrow new space from these fields + (void * can be safety replaced with struct * ;) + P L E A S E ! ! ! + IMPORTANT: Never change order of already declared fields!!! + */ + unsigned long long int + ull_res0,ull_res1,ull_res2,ull_res3,ull_res4,ull_res5, + ull_res6,ull_res7,ull_res8,ull_res9,ull_res10,ull_res11,ull_res12; + float + flt_res0,flt_res1,flt_res2,flt_res3,flt_res4,flt_res5, + flt_res6,flt_res7,flt_res8,flt_res9,flt_res10,flt_res11; + void + *ptr_res0,*ptr_res1,*ptr_res2,*ptr_res3,*ptr_res4,*ptr_res5, + *ptr_res6,*ptr_res7,*ptr_res8,*ptr_res9,*ptr_res10,*ptr_res11; + unsigned long int + ul_res0,ul_res1,ul_res2,ul_res3,ul_res4,ul_res5, + ul_res6,ul_res7,ul_res8,ul_res9,ul_res10,ul_res11,ul_res12; + unsigned int + ui_res0,ui_res1,ui_res2,ui_res3,ui_res4,ui_res5, + ui_res6; + unsigned short int + us_res0,us_res1,us_res2,us_res3,us_res4,us_res5, + us_res6,us_res7,us_res8,us_res9,us_res10,us_res11,us_res12; + unsigned char + uc_res0,uc_res1,uc_res2,uc_res3,uc_res4,uc_res5, + uc_res6,uc_res7,uc_res8,uc_res9,uc_res10,uc_res11,uc_res12; } AVCodecContext; typedef struct AVCodec { @@ -177,6 +292,23 @@ typedef struct AVCodec { UINT8 *buf, int buf_size); int capabilities; struct AVCodec *next; + /* + Note: Below are located reserved fields for further usage + It requires for ABI !!! 
+ If you'll perform some changes then borrow new space from these fields + (void * can be safety replaced with struct * ;) + P L E A S E ! ! ! + IMPORTANT: Never change order of already declared fields!!! + */ + unsigned long long int + ull_res0,ull_res1,ull_res2,ull_res3,ull_res4,ull_res5, + ull_res6,ull_res7,ull_res8,ull_res9,ull_res10,ull_res11,ull_res12; + float + flt_res0,flt_res1,flt_res2,flt_res3,flt_res4,flt_res5, + flt_res6,flt_res7,flt_res8,flt_res9,flt_res10,flt_res11,flt_res12; + void + *ptr_res0,*ptr_res1,*ptr_res2,*ptr_res3,*ptr_res4,*ptr_res5, + *ptr_res6,*ptr_res7,*ptr_res8,*ptr_res9,*ptr_res10,*ptr_res11,*ptr_res12; } AVCodec; /* three components are given, that's all */ @@ -185,15 +317,47 @@ typedef struct AVPicture { int linesize[3]; } AVPicture; +extern AVCodec ac3_encoder; +extern AVCodec mp2_encoder; +extern AVCodec mp3lame_encoder; +extern AVCodec mpeg1video_encoder; +extern AVCodec h263_encoder; +extern AVCodec h263p_encoder; +extern AVCodec rv10_encoder; +extern AVCodec mjpeg_encoder; +extern AVCodec mpeg4_encoder; +extern AVCodec msmpeg4v1_encoder; +extern AVCodec msmpeg4v2_encoder; +extern AVCodec msmpeg4v3_encoder; + extern AVCodec h263_decoder; extern AVCodec mpeg4_decoder; extern AVCodec msmpeg4v1_decoder; extern AVCodec msmpeg4v2_decoder; extern AVCodec msmpeg4v3_decoder; +extern AVCodec wmv1_decoder; extern AVCodec mpeg_decoder; extern AVCodec h263i_decoder; extern AVCodec rv10_decoder; extern AVCodec mjpeg_decoder; +extern AVCodec mp2_decoder; +extern AVCodec mp3_decoder; + +/* pcm codecs */ +#define PCM_CODEC(id, name) \ +extern AVCodec name ## _decoder; \ +extern AVCodec name ## _encoder; + +PCM_CODEC(CODEC_ID_PCM_S16LE, pcm_s16le); +PCM_CODEC(CODEC_ID_PCM_S16BE, pcm_s16be); +PCM_CODEC(CODEC_ID_PCM_U16LE, pcm_u16le); +PCM_CODEC(CODEC_ID_PCM_U16BE, pcm_u16be); +PCM_CODEC(CODEC_ID_PCM_S8, pcm_s8); +PCM_CODEC(CODEC_ID_PCM_U8, pcm_u8); +PCM_CODEC(CODEC_ID_PCM_ALAW, pcm_alaw); +PCM_CODEC(CODEC_ID_PCM_MULAW, pcm_mulaw); + +#undef 
PCM_CODEC /* dummy raw video codec */ extern AVCodec rawvideo_codec; @@ -242,8 +406,14 @@ int avpicture_deinterlace(AVPicture *dst, AVPicture *src, extern AVCodec *first_avcodec; +/* returns LIBAVCODEC_VERSION_INT constant */ +unsigned avcodec_version(void); +/* returns LIBAVCODEC_BUILD constant */ +unsigned avcodec_build(void); void avcodec_init(void); +void avcodec_set_bit_exact(void); + void register_avcodec(AVCodec *format); AVCodec *avcodec_find_encoder(enum CodecID id); AVCodec *avcodec_find_encoder_by_name(const char *name); @@ -267,12 +437,87 @@ int avcodec_close(AVCodecContext *avctx); void avcodec_register_all(void); +void avcodec_flush_buffers(AVCodecContext *avctx); + #ifdef FF_POSTPROCESS -#ifndef MBC -#define MBC 48 -#define MBR 36 -#endif extern int quant_store[MBR+1][MBC+1]; // [Review] #endif + +/** + * Interface for 0.5.0 version + * + * do not even think about it's usage for this moment + */ + +typedef struct { + // compressed size used from given memory buffer + int size; + /// I/P/B frame type + int frame_type; +} avc_enc_result_t; + +/** + * Commands + * order can't be changed - once it was defined + */ +typedef enum { + // general commands + AVC_OPEN_BY_NAME = 0xACA000, + AVC_OPEN_BY_CODEC_ID, + AVC_OPEN_BY_FOURCC, + AVC_CLOSE, + + AVC_FLUSH, + // pin - struct { uint8_t* src, uint_t src_size } + // pout - struct { AVPicture* img, consumed_bytes, + AVC_DECODE, + // pin - struct { AVPicture* img, uint8_t* dest, uint_t dest_size } + // pout - uint_t used_from_dest_size + AVC_ENCODE, + + // query/get video commands + AVC_GET_VERSION = 0xACB000, + AVC_GET_WIDTH, + AVC_GET_HEIGHT, + AVC_GET_DELAY, + AVC_GET_QUANT_TABLE, + // ... 
+ + // query/get audio commands + AVC_GET_FRAME_SIZE = 0xABC000, + + // maybe define some simple structure which + // might be passed to the user - but they can't + // contain any codec specific parts and these + // calls are usually necessary only a few times + + // set video commands + AVC_SET_WIDTH = 0xACD000, + AVC_SET_HEIGHT, + + // set video encoding commands + AVC_SET_FRAME_RATE = 0xACD800, + AVC_SET_QUALITY, + AVC_SET_HURRY_UP, + + // set audio commands + AVC_SET_SAMPLE_RATE = 0xACE000, + AVC_SET_CHANNELS, + +} avc_cmd_t; + +/** + * \param handle allocated private structure by libavcodec + * for initialization pass NULL - will be returned pout + * user is supposed to know nothing about its structure + * \param cmd type of operation to be performed + * \param pin input parameter + * \param pout output parameter + * + * \returns command status - eventually for query command it might return + * integer resulting value + */ +int avcodec(void* handle, avc_cmd_t cmd, void* pin, void* pout); + #endif /* AVCODEC_H */ diff --git a/src/libffmpeg/libavcodec/common.c b/src/libffmpeg/libavcodec/common.c index f7fe2e1d1..571de1afc 100644 --- a/src/libffmpeg/libavcodec/common.c +++ b/src/libffmpeg/libavcodec/common.c @@ -1,25 +1,24 @@ /* * Common bit i/o utils - * Copyright (c) 2000, 2001 Gerard Lantau. + * Copyright (c) 2000, 2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
* - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * alternative bitstream reader & writer by Michael Niedermayer <michaelni@gmx.at> */ #include "common.h" -#include <math.h> void init_put_bits(PutBitContext *s, UINT8 *buffer, int buffer_size, @@ -108,6 +107,15 @@ void jflush_put_bits(PutBitContext *s) } #endif +void put_string(PutBitContext * pbc, char *s) +{ + while(*s){ + put_bits(pbc, 8, *s); + s++; + } + put_bits(pbc, 8, 0); +} + /* bit input functions */ void init_get_bits(GetBitContext *s, @@ -166,6 +174,9 @@ unsigned int get_bits_long(GetBitContext *s, int n) (buf_ptr[-2] << 8) | (buf_ptr[-1]); #endif + val |= bit_buf >> (32 + bit_cnt); + bit_buf <<= - bit_cnt; + bit_cnt += 32; } else { buf_ptr -= 4; bit_buf = 0; @@ -177,11 +188,13 @@ unsigned int get_bits_long(GetBitContext *s, int n) bit_buf |= *buf_ptr++ << 8; if (buf_ptr < s->buf_end) bit_buf |= *buf_ptr++; + + val |= bit_buf >> (32 + bit_cnt); + bit_buf <<= - bit_cnt; + bit_cnt += 8*(buf_ptr - s->buf_ptr); + if(bit_cnt<0) bit_cnt=0; } s->buf_ptr = buf_ptr; - val |= bit_buf >> (32 + bit_cnt); - bit_buf <<= - bit_cnt; - bit_cnt += 32; } s->bit_buf = bit_buf; s->bit_cnt = bit_cnt; @@ -349,7 +362,7 @@ static int 
build_table(VLC *vlc, int table_nb_bits, #endif if (table_bits[j] != 0) { fprintf(stderr, "incorrect codes\n"); - abort(); + exit(1); } table_bits[j] = n; table_codes[j] = i; @@ -435,10 +448,8 @@ int init_vlc(VLC *vlc, int nb_bits, int nb_codes, bits, bits_wrap, bits_size, codes, codes_wrap, codes_size, 0, 0) < 0) { - if (vlc->table_bits) - free(vlc->table_bits); - if (vlc->table_codes) - free(vlc->table_codes); + av_free(vlc->table_bits); + av_free(vlc->table_codes); return -1; } return 0; @@ -447,7 +458,11 @@ int init_vlc(VLC *vlc, int nb_bits, int nb_codes, void free_vlc(VLC *vlc) { - free(vlc->table_bits); - free(vlc->table_codes); + av_free(vlc->table_bits); + av_free(vlc->table_codes); } +int ff_gcd(int a, int b){ + if(b) return ff_gcd(b, a%b); + else return a; +} diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h index 9c7b086d8..24bd367d6 100644 --- a/src/libffmpeg/libavcodec/common.h +++ b/src/libffmpeg/libavcodec/common.h @@ -1,8 +1,6 @@ #ifndef COMMON_H #define COMMON_H -#undef DEBUG - #define FFMPEG_VERSION_INT 0x000406 #define FFMPEG_VERSION "0.4.6" @@ -19,18 +17,19 @@ #ifdef HAVE_AV_CONFIG_H /* only include the following when compiling package */ -#include "../config.h" +#include "config.h" #include <stdlib.h> #include <stdio.h> #include <string.h> #include <errno.h> +#include <math.h> #ifndef ENODATA #define ENODATA 61 #endif -#endif +#endif /* HAVE_AV_CONFIG_H */ #ifdef CONFIG_WIN32 @@ -51,6 +50,8 @@ typedef UINT16 uint16_t; typedef INT16 int16_t; typedef UINT32 uint32_t; typedef INT32 int32_t; +typedef UINT64 uint64_t; +typedef INT64 int64_t; #ifndef __MINGW32__ #define INT64_C(c) (c ## i64) @@ -58,14 +59,6 @@ typedef INT32 int32_t; #define inline __inline -/* - Disable warning messages: - warning C4244: '=' : conversion from 'double' to 'float', possible loss of data - warning C4305: 'argument' : truncation from 'const double' to 'float' -*/ -#pragma warning( disable : 4244 ) -#pragma warning( disable : 4305 ) - 
#else #define INT64_C(c) (c ## LL) #define UINT64_C(c) (c ## ULL) @@ -78,22 +71,9 @@ typedef INT32 int32_t; #define DEBUG #endif -// code from bits/byteswap.h (C) 1997, 1998 Free Software Foundation, Inc. -#define bswap_32(x) \ - ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ - (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) -#define be2me_32(x) bswap_32(x) - #define snprintf _snprintf -#ifndef __MINGW32__ -/* no config.h with VC */ -#define CONFIG_ENCODERS 1 -#define CONFIG_DECODERS 1 -#define CONFIG_AC3 1 -#endif - -#else +#else /* CONFIG_WIN32 */ /* unix */ @@ -112,8 +92,6 @@ typedef signed char INT8; typedef signed int INT32; typedef signed long long INT64; -#include "xine-engine/bswap.h" - #ifdef HAVE_AV_CONFIG_H #ifdef __FreeBSD__ @@ -133,10 +111,19 @@ typedef signed long long INT64; #endif /* !CONFIG_WIN32 */ +#include "bswap.h" -/* debug stuff */ #ifdef HAVE_AV_CONFIG_H +#if defined(__MINGW32__) || defined(__CYGWIN__) || \ + defined(__OS2__) || defined (__OpenBSD__) +#define MANGLE(a) "_" #a +#else +#define MANGLE(a) #a +#endif + +/* debug stuff */ + #ifndef DEBUG #define NDEBUG #endif @@ -150,11 +137,7 @@ inline void dprintf(const char* fmt,...) {} #else #ifdef DEBUG -#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95) || !defined(__GNUC__) -#define dprintf(...) printf(__VA_ARGS__) -#else #define dprintf(fmt,args...) printf(fmt, ## args) -#endif #else #define dprintf(fmt,args...) #endif @@ -163,6 +146,14 @@ inline void dprintf(const char* fmt,...) {} #endif /* HAVE_AV_CONFIG_H */ +#define av_abort() do { fprintf(stderr, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0) + +/* assume b>0 */ +#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b)) +#define ABS(a) ((a) >= 0 ? (a) : (-(a))) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#define MIN(a,b) ((a) > (b) ? 
(b) : (a)) + /* bit output */ struct PutBitContext; @@ -189,6 +180,7 @@ void init_put_bits(PutBitContext *s, INT64 get_bit_count(PutBitContext *s); /* XXX: change function name */ void align_put_bits(PutBitContext *s); void flush_put_bits(PutBitContext *s); +void put_string(PutBitContext * pbc, char *s); /* jpeg specific put_bits */ void jflush_put_bits(PutBitContext *s); @@ -250,7 +242,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value) #endif // printf("put_bits=%d %x\n", n, value); assert(n == 32 || value < (1U << n)); - + bit_buf = s->bit_buf; bit_left = s->bit_left; @@ -430,7 +422,6 @@ static inline void jput_bits(PutBitContext *s, int n, int value) } #endif - static inline uint8_t* pbBufPtr(PutBitContext *s) { #ifdef ALT_BITSTREAM_WRITER @@ -483,7 +474,6 @@ static inline unsigned int get_bits(GetBitContext *s, int n){ } printf(" "); #endif - return result; #endif //!ALIGNED_BITSTREAM #else //ALT_BITSTREAM_READER @@ -509,10 +499,10 @@ static inline unsigned int get_bits1(GetBitContext *s){ result>>= 8 - 1; index++; s->index= index; + #ifdef DUMP_STREAM printf("%d ", result); #endif - return result; #else if(s->bit_cnt>0){ @@ -888,7 +878,62 @@ static inline int mid_pred(int a, int b, int c) return a + b + c - vmin - vmax; } +static inline int clip(int a, int amin, int amax) +{ + if (a < amin) + return amin; + else if (a > amax) + return amax; + else + return a; +} + /* memory */ +void *av_malloc(int size); void *av_mallocz(int size); +void av_free(void *ptr); +void __av_freep(void **ptr); +#define av_freep(p) __av_freep((void **)(p)) + +/* math */ +int ff_gcd(int a, int b); + +static inline int ff_sqrt(int a) +{ + int ret=0; + int s; + int ret_sq=0; + + for(s=15; s>=0; s--){ + int b= ret_sq + (1<<(s*2)) + (ret<<s)*2; + if(b<=a){ + ret_sq=b; + ret+= 1<<s; + } + } + return ret; +} +#define RUNTIME_CPUDETECT + +#if __CPU__ >= 686 && !defined(RUNTIME_CPUDETECT) +#define COPY3_IF_LT(x,y,a,b,c,d)\ +asm volatile (\ + "cmpl %0, %3 \n\t"\ + 
"cmovl %3, %0 \n\t"\ + "cmovl %4, %1 \n\t"\ + "cmovl %5, %2 \n\t"\ + : "+r" (x), "+r" (a), "+r" (c)\ + : "r" (y), "r" (b), "r" (d)\ +); +#else +#define COPY3_IF_LT(x,y,a,b,c,d)\ +if((y)<(x)){\ + (x)=(y);\ + (a)=(b);\ + (c)=(d);\ +} +#endif + +#define CLAMP_TO_8BIT(d) ((d > 0xff) ? 0xff : (d < 0) ? 0 : d) #endif diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c index dcfad05a5..945b7cc9d 100644 --- a/src/libffmpeg/libavcodec/dsputil.c +++ b/src/libffmpeg/libavcodec/dsputil.c @@ -1,32 +1,33 @@ /* * DSP utils - * Copyright (c) 2000, 2001 Gerard Lantau. + * Copyright (c) 2000, 2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * gmc & q-pel support by Michael Niedermayer <michaelni@gmx.at> + * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at> */ -#include <stdlib.h> -#include <stdio.h> -#include <math.h> #include "avcodec.h" #include "dsputil.h" #include "simple_idct.h" void (*ff_idct)(DCTELEM *block); +void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block); +void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block); +void (*av_fdct)(DCTELEM *block); void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size); +void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride); void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder); @@ -45,8 +46,10 @@ op_pixels_abs_func pix_abs8x8_xy2; UINT8 cropTbl[256 + 2 * MAX_NEG_CROP]; UINT32 squareTbl[512]; -extern UINT16 default_intra_matrix[64]; -extern UINT16 default_non_intra_matrix[64]; +extern INT16 default_intra_matrix[64]; +extern INT16 default_non_intra_matrix[64]; +extern INT16 ff_mpeg4_default_intra_matrix[64]; +extern INT16 ff_mpeg4_default_non_intra_matrix[64]; UINT8 zigzag_direct[64] = { 0, 1, 8, 16, 9, 2, 3, 10, @@ -87,6 +90,8 @@ UINT8 ff_alternate_vertical_scan[64] = { 38, 46, 54, 62, 39, 47, 55, 63, }; +#ifdef SIMPLE_IDCT + /* Input permutation for the simple_idct_mmx */ static UINT8 simple_mmx_permutation[64]={ 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, @@ -98,6 +103,7 @@ static UINT8 simple_mmx_permutation[64]={ 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, }; +#endif /* a*inverse[b]>>32 == a/b for all 
0<=a<=65536 && 2<=b<=255 */ UINT32 inverse[256]={ @@ -141,7 +147,7 @@ UINT8 zigzag_end[64]; UINT8 permutation[64]; //UINT8 invPermutation[64]; -static void build_zigzag_end() +static void build_zigzag_end(void) { int lastIndex; int lastIndexAfterPerm=0; @@ -176,6 +182,28 @@ void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size) } } +void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){ + DCTELEM *p; + int i; + + /* read the pixels */ + p = block; + for(i=0;i<8;i++) { + p[0] = s1[0] - s2[0]; + p[1] = s1[1] - s2[1]; + p[2] = s1[2] - s2[2]; + p[3] = s1[3] - s2[3]; + p[4] = s1[4] - s2[4]; + p[5] = s1[5] - s2[5]; + p[6] = s1[6] - s2[6]; + p[7] = s1[7] - s2[7]; + s1 += stride; + s2 += stride; + p += 8; + } +} + + void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size) { const DCTELEM *p; @@ -224,6 +252,358 @@ void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size) } } +#ifdef __GNUC__ + +struct unaligned_64 { uint64_t l; } __attribute__((packed)); +struct unaligned_32 { uint32_t l; } __attribute__((packed)); + +#define LD32(a) (((const struct unaligned_32 *) (a))->l) +#define LD64(a) (((const struct unaligned_64 *) (a))->l) + +#else /* __GNUC__ */ + +#define LD32(a) (*((uint32_t*)(a))) +#define LD64(a) (*((uint64_t*)(a))) + +#endif /* !__GNUC__ */ + +#if 0 + +#define PIXOP2(OPNAME, OP) \ +void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + OP(*((uint64_t*)block), LD64(pixels));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + const uint64_t a= LD64(pixels );\ + const uint64_t b= LD64(pixels+1);\ + OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _pixels_x2(uint8_t *block, const 
uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + const uint64_t a= LD64(pixels );\ + const uint64_t b= LD64(pixels+1);\ + OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + const uint64_t a= LD64(pixels );\ + const uint64_t b= LD64(pixels+line_size);\ + OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + const uint64_t a= LD64(pixels );\ + const uint64_t b= LD64(pixels+line_size);\ + OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + const uint64_t a= LD64(pixels );\ + const uint64_t b= LD64(pixels+1);\ + uint64_t l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0202020202020202ULL;\ + uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + uint64_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i<h; i+=2){\ + uint64_t a= LD64(pixels );\ + uint64_t b= LD64(pixels+1);\ + l1= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL);\ + h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD64(pixels );\ + b= LD64(pixels+1);\ + l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0202020202020202ULL;\ + h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + 
pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + const uint64_t a= LD64(pixels );\ + const uint64_t b= LD64(pixels+1);\ + uint64_t l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0101010101010101ULL;\ + uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + uint64_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i<h; i+=2){\ + uint64_t a= LD64(pixels );\ + uint64_t b= LD64(pixels+1);\ + l1= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL);\ + h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD64(pixels );\ + b= LD64(pixels+1);\ + l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0101010101010101ULL;\ + h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ + OPNAME ## _pixels,\ + OPNAME ## _pixels_x2,\ + OPNAME ## _pixels_y2,\ + OPNAME ## _pixels_xy2,\ +};\ +\ +void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ + OPNAME ## _pixels,\ + OPNAME ## _no_rnd_pixels_x2,\ + OPNAME ## _no_rnd_pixels_y2,\ + OPNAME ## _no_rnd_pixels_xy2,\ +}; + +#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) ) +#else // 64 bit variant + +#define PIXOP2(OPNAME, OP) \ +void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + OP(*((uint32_t*)(block )), LD32(pixels ));\ + OP(*((uint32_t*)(block+4)), LD32(pixels+4));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ 
+void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + int j;\ + for(j=0; j<2; j++){\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+1);\ + OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\ + pixels+=4;\ + block +=4;\ + }\ + pixels+=line_size-8;\ + block +=line_size-8;\ + }\ +}\ +\ +void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + int j;\ + for(j=0; j<2; j++){\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+1);\ + OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\ + pixels+=4;\ + block +=4;\ + }\ + pixels+=line_size-8;\ + block +=line_size-8;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + int j;\ + for(j=0; j<2; j++){\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+line_size);\ + OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\ + pixels+=4;\ + block +=4;\ + }\ + pixels+=line_size-8;\ + block +=line_size-8;\ + }\ +}\ +\ +void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i<h; i++){\ + int j;\ + for(j=0; j<2; j++){\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+line_size);\ + OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\ + pixels+=4;\ + block +=4;\ + }\ + pixels+=line_size-8;\ + block +=line_size-8;\ + }\ +}\ +\ +void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int j;\ + for(j=0; j<2; j++){\ + int i;\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i<h; i+=2){\ 
+ uint32_t a= LD32(pixels );\ + uint32_t b= LD32(pixels+1);\ + l1= (a&0x03030303UL)\ + + (b&0x03030303UL);\ + h1= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD32(pixels );\ + b= LD32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ + pixels+=4-line_size*(h+1);\ + block +=4-line_size*h;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int j;\ + for(j=0; j<2; j++){\ + int i;\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i<h; i+=2){\ + uint32_t a= LD32(pixels );\ + uint32_t b= LD32(pixels+1);\ + l1= (a&0x03030303UL)\ + + (b&0x03030303UL);\ + h1= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD32(pixels );\ + b= LD32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ + pixels+=4-line_size*(h+1);\ + block +=4-line_size*h;\ + }\ +}\ +\ +void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ + OPNAME ## _pixels,\ + OPNAME ## _pixels_x2,\ + OPNAME ## _pixels_y2,\ + OPNAME ## _pixels_xy2,\ +};\ +\ +void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ + OPNAME ## _pixels,\ + 
OPNAME ## _no_rnd_pixels_x2,\ + OPNAME ## _no_rnd_pixels_y2,\ + OPNAME ## _no_rnd_pixels_xy2,\ +}; +#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) +#endif + +#define op_put(a, b) a = b + +PIXOP2(avg, op_avg) +PIXOP2(put, op_put) +#undef op_avg +#undef op_put + +#if 0 +/* FIXME this stuff could be removed as its ot really used anymore */ #define PIXOP(BTYPE, OPNAME, OP, INCR) \ \ static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \ @@ -323,18 +703,13 @@ void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_siz OPNAME ## _pixels_xy2, \ }; - /* rounding primitives */ #define avg2(a,b) ((a+b+1)>>1) #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) -#define op_put(a, b) a = b #define op_avg(a, b) a = avg2(a, b) #define op_sub(a, b) a -= b -PIXOP(UINT8, put, op_put, line_size) -PIXOP(UINT8, avg, op_avg, line_size) - PIXOP(DCTELEM, sub, op_sub, 8) /* not rounding primitives */ @@ -343,13 +718,12 @@ PIXOP(DCTELEM, sub, op_sub, 8) #define avg2(a,b) ((a+b)>>1) #define avg4(a,b,c,d) ((a+b+c+d+1)>>2) -PIXOP(UINT8, put_no_rnd, op_put, line_size) -PIXOP(UINT8, avg_no_rnd, op_avg, line_size) - /* motion estimation */ #undef avg2 #undef avg4 +#endif + #define avg2(a,b) ((a+b+1)>>1) #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) @@ -872,6 +1246,20 @@ void clear_blocks_c(DCTELEM *blocks) memset(blocks, 0, sizeof(DCTELEM)*6*64); } +/* XXX: those functions should be suppressed ASAP when all IDCTs are + converted */ +void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block) +{ + ff_idct (block); + put_pixels_clamped(block, dest, line_size); +} + +void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block) +{ + ff_idct (block); + add_pixels_clamped(block, dest, line_size); +} + void dsputil_init(void) { int i, j; @@ -888,11 +1276,12 @@ void dsputil_init(void) } #ifdef SIMPLE_IDCT - ff_idct = simple_idct; + ff_idct = NULL; #else ff_idct = j_rev_dct; #endif get_pixels = get_pixels_c; + diff_pixels = diff_pixels_c; 
put_pixels_clamped = put_pixels_clamped_c; add_pixels_clamped = add_pixels_clamped_c; gmc1= gmc1_c; @@ -906,7 +1295,7 @@ void dsputil_init(void) pix_abs8x8_x2 = pix_abs8x8_x2_c; pix_abs8x8_y2 = pix_abs8x8_y2_c; pix_abs8x8_xy2 = pix_abs8x8_xy2_c; - av_fdct = jpeg_fdct_ifast; + av_fdct = fdct_ifast; use_permuted_idct = 1; @@ -925,9 +1314,16 @@ void dsputil_init(void) use_permuted_idct = 0; #endif -#ifdef SIMPLE_IDCT - if(ff_idct == simple_idct) use_permuted_idct=0; -#endif +//#ifdef SIMPLE_IDCT + if (ff_idct == NULL) { + ff_idct_put = simple_idct_put; + ff_idct_add = simple_idct_add; + use_permuted_idct=0; + } else { + ff_idct_put = gen_idct_put; + ff_idct_add = gen_idct_add; + } +//#endif if(use_permuted_idct) #ifdef SIMPLE_IDCT @@ -953,11 +1349,21 @@ void dsputil_init(void) } block_permute(default_intra_matrix); block_permute(default_non_intra_matrix); + block_permute(ff_mpeg4_default_intra_matrix); + block_permute(ff_mpeg4_default_non_intra_matrix); } build_zigzag_end(); } +/* remove any non bit exact operation (testing purpose) */ +void avcodec_set_bit_exact(void) +{ +#ifdef HAVE_MMX + dsputil_set_bit_exact_mmx(); +#endif +} + void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3], int orig_linesize[3], int coded_linesize, AVCodecContext *avctx) diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h index dc63f06f1..b7b7e999c 100644 --- a/src/libffmpeg/libavcodec/dsputil.h +++ b/src/libffmpeg/libavcodec/dsputil.h @@ -1,21 +1,39 @@ +/* + * DSP utils + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ #ifndef DSPUTIL_H #define DSPUTIL_H #include "common.h" #include "avcodec.h" +#include "xineutils.h" #undef DEBUG -//#define DEBUG /* dct code */ typedef short DCTELEM; -void jpeg_fdct_ifast (DCTELEM *data); +void fdct_ifast (DCTELEM *data); void j_rev_dct (DCTELEM *data); void fdct_mmx(DCTELEM *block); -void (*av_fdct)(DCTELEM *block); +extern void (*av_fdct)(DCTELEM *block); /* encoding scans */ extern UINT8 ff_alternate_horizontal_scan[64]; @@ -37,7 +55,10 @@ void dsputil_init(void); /* pixel ops : interface with DCT */ extern void (*ff_idct)(DCTELEM *block); +extern void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block); +extern void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block); extern void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size); +extern void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride); extern void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); extern void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); extern void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder); @@ -45,6 +66,7 @@ extern void (*clear_blocks)(DCTELEM *blocks); void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size); +void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride); void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size); void add_pixels_clamped_c(const DCTELEM *block, UINT8 
*pixels, int line_size); void clear_blocks_c(DCTELEM *blocks); @@ -60,13 +82,6 @@ extern op_pixels_func avg_no_rnd_pixels_tab[4]; extern qpel_mc_func qpel_mc_rnd_tab[16]; extern qpel_mc_func qpel_mc_no_rnd_tab[16]; - -/* sub pixel (encoding) */ -extern void (*sub_pixels_tab[4])(DCTELEM *block, const UINT8 *pixels, int line_size, int h); - -#define sub_pixels_2(block, pixels, line_size, dxy) \ - sub_pixels_tab[dxy](block, pixels, line_size, 8) - /* motion estimation */ typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size); @@ -91,9 +106,9 @@ static inline int block_permute_op(int j) } void block_permute(INT16 *block); - + #if defined(ARCH_X86) -#define HAVE_MMX +#define HAVE_MMX 1 #endif #if defined(HAVE_MMX) @@ -108,10 +123,10 @@ void block_permute(INT16 *block); extern int mm_flags; -/* int mm_support(void); */ +/*int mm_support(void);*/ #define mm_support() xine_mm_accel() -#if 0 +#if 0 static inline void emms(void) { __asm __volatile ("emms;":::"memory"); @@ -127,6 +142,7 @@ static inline void emms(void) #define __align8 __attribute__ ((aligned (8))) void dsputil_init_mmx(void); +void dsputil_set_bit_exact_mmx(void); #elif defined(ARCH_ARMV4L) diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c index a8d04d58a..decddd344 100644 --- a/src/libffmpeg/libavcodec/h263.c +++ b/src/libffmpeg/libavcodec/h263.c @@ -1,25 +1,27 @@ /* * H263/MPEG4 backend for ffmpeg encoder and decoder - * Copyright (c) 2000,2001 Gerard Lantau. + * Copyright (c) 2000,2001 Fabrice Bellard. * H263+ support. * Copyright (c) 2001 Juan J. Sierralta P. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
+ * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * ac prediction encoding by Michael Niedermayer <michaelni@gmx.at> + * ac prediction encoding & b-frame support by Michael Niedermayer <michaelni@gmx.at> */ + +//#define DEBUG #include "common.h" #include "dsputil.h" #include "avcodec.h" @@ -28,24 +30,26 @@ #include "mpeg4data.h" //rounded divison & shift -#define RDIV(a,b) ((a) > 0 ? ((a)+((b)>>1))/(b) : ((a)-((b)>>1))/(b)) #define RSHIFT(a,b) ((a) > 0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b)) -#define ABS(a) (((a)>=0)?(a):(-(a))) -#define MAX(a,b) ((a) > (b) ? (a) : (b)) -#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) + +#define PRINT_MB_TYPE(a) ; +//#define PRINT_MB_TYPE(a) printf(a); static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n); -static void h263_encode_motion(MpegEncContext * s, int val); +static void h263_encode_motion(MpegEncContext * s, int val, int fcode); static void h263p_encode_umotion(MpegEncContext * s, int val); static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, - int n, int dc, UINT8 *scan_table); + int n, int dc, UINT8 *scan_table, + PutBitContext *dc_pb, PutBitContext *ac_pb); static int h263_decode_motion(MpegEncContext * s, int pred, int fcode); static int h263p_decode_umotion(MpegEncContext * s, int pred); static int h263_decode_block(MpegEncContext * s, DCTELEM * block, int n, int coded); -static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, +static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr); +static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, int n, int coded); +static int h263_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr); static inline int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *dir_ptr); static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n, int dir); @@ -65,15 +69,15 @@ int h263_get_picture_format(int width, int height) int format; if (width == 128 && height == 96) - format = 1; + format = 1; else if (width == 176 && height == 144) - format = 2; + format = 2; else if (width == 352 && height == 288) - format = 3; + format = 3; else if (width == 704 && height == 576) - format = 4; + format = 4; else if (width == 1408 && height == 1152) - format = 5; + format = 5; else format = 7; return format; @@ -128,7 +132,7 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number) put_bits(&s->pb, 1, s->umvplus); /* Unrestricted Motion Vector */ put_bits(&s->pb,1,0); /* SAC: off */ put_bits(&s->pb,1,0); /* Advanced Prediction Mode: off */ - put_bits(&s->pb,1,0); /* Advanced Intra 
Coding: off */ + put_bits(&s->pb,1,s->h263_aic); /* Advanced Intra Coding */ put_bits(&s->pb,1,0); /* Deblocking Filter: off */ put_bits(&s->pb,1,0); /* Slice Structured: off */ put_bits(&s->pb,1,0); /* Reference Picture Selection: off */ @@ -142,7 +146,11 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number) put_bits(&s->pb,1,0); /* Reference Picture Resampling: off */ put_bits(&s->pb,1,0); /* Reduced-Resolution Update: off */ - put_bits(&s->pb,1,0); /* Rounding Type */ + if (s->pict_type == I_TYPE) + s->no_rounding = 0; + else + s->no_rounding ^= 1; + put_bits(&s->pb,1,s->no_rounding); /* Rounding Type */ put_bits(&s->pb,2,0); /* Reserved */ put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */ @@ -152,6 +160,9 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number) if (format == 7) { /* Custom Picture Format (CPFMT) */ + if (s->aspect_ratio_info) + put_bits(&s->pb,4,s->aspect_ratio_info); + else put_bits(&s->pb,4,2); /* Aspect ratio: CIF 12:11 (4:3) picture */ put_bits(&s->pb,9,(s->width >> 2) - 1); put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */ @@ -252,78 +263,230 @@ void mpeg4_encode_mb(MpegEncContext * s, DCTELEM block[6][64], int motion_x, int motion_y) { - int cbpc, cbpy, i, cbp, pred_x, pred_y; + int cbpc, cbpy, i, pred_x, pred_y; int bits; + PutBitContext * const pb2 = s->data_partitioning ? &s->pb2 : &s->pb; + PutBitContext * const tex_pb = s->data_partitioning && s->pict_type!=B_TYPE ? &s->tex_pb : &s->pb; + PutBitContext * const dc_pb = s->data_partitioning && s->pict_type!=I_TYPE ? &s->pb2 : &s->pb; + const int interleaved_stats= (s->flags&CODEC_FLAG_PASS1) && !s->data_partitioning ? 
1 : 0; // printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y); if (!s->mb_intra) { /* compute cbp */ - cbp = 0; + int cbp = 0; for (i = 0; i < 6; i++) { - if (s->block_last_index[i] >= 0) - cbp |= 1 << (5 - i); - } - if ((cbp | motion_x | motion_y) == 0 && s->mv_type==MV_TYPE_16X16) { - /* skip macroblock */ - put_bits(&s->pb, 1, 1); - s->misc_bits++; - s->last_bits++; - s->skip_count++; - return; + if (s->block_last_index[i] >= 0) + cbp |= 1 << (5 - i); } - put_bits(&s->pb, 1, 0); /* mb coded */ - if(s->mv_type==MV_TYPE_16X16){ - cbpc = cbp & 3; - put_bits(&s->pb, - inter_MCBPC_bits[cbpc], - inter_MCBPC_code[cbpc]); - cbpy = cbp >> 2; - cbpy ^= 0xf; - put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); - - bits= get_bit_count(&s->pb); - s->misc_bits+= bits - s->last_bits; - s->last_bits=bits; - /* motion vectors: 16x16 mode */ - h263_pred_motion(s, 0, &pred_x, &pred_y); - - h263_encode_motion(s, motion_x - pred_x); - h263_encode_motion(s, motion_y - pred_y); - }else{ - cbpc = (cbp & 3)+16; - put_bits(&s->pb, - inter_MCBPC_bits[cbpc], - inter_MCBPC_code[cbpc]); - cbpy = cbp >> 2; - cbpy ^= 0xf; - put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); + if(s->pict_type==B_TYPE){ + static const int mb_type_table[8]= {-1, 2, 3, 1,-1,-1,-1, 0}; /* convert from mv_dir to type */ + int mb_type= mb_type_table[s->mv_dir]; + + if(s->mb_x==0){ + s->last_mv[0][0][0]= + s->last_mv[0][0][1]= + s->last_mv[1][0][0]= + s->last_mv[1][0][1]= 0; + } - bits= get_bit_count(&s->pb); - s->misc_bits+= bits - s->last_bits; - s->last_bits=bits; + /* nothing to do if this MB was skiped in the next P Frame */ + if(s->mbskip_table[s->mb_y * s->mb_width + s->mb_x]){ + s->skip_count++; + s->mv[0][0][0]= + s->mv[0][0][1]= + s->mv[1][0][0]= + s->mv[1][0][1]= 0; + s->mv_dir= MV_DIR_FORWARD; //doesnt matter + return; + } - for(i=0; i<4; i++){ - /* motion vectors: 8x8 mode*/ - h263_pred_motion(s, i, &pred_x, &pred_y); + if ((cbp | motion_x | motion_y | mb_type) ==0) { + /* direct MB with MV={0,0} */ + 
put_bits(&s->pb, 1, 1); /* mb not coded modb1=1 */ - h263_encode_motion(s, s->motion_val[ s->block_index[i] ][0] - pred_x); - h263_encode_motion(s, s->motion_val[ s->block_index[i] ][1] - pred_y); + if(interleaved_stats){ + s->misc_bits++; + s->last_bits++; + } + s->skip_count++; + return; } - } - bits= get_bit_count(&s->pb); - s->mv_bits+= bits - s->last_bits; - s->last_bits=bits; + put_bits(&s->pb, 1, 0); /* mb coded modb1=0 */ + put_bits(&s->pb, 1, cbp ? 0 : 1); /* modb2 */ //FIXME merge + put_bits(&s->pb, mb_type+1, 1); // this table is so simple that we dont need it :) + if(cbp) put_bits(&s->pb, 6, cbp); + + if(cbp && mb_type) + put_bits(&s->pb, 1, 0); /* no q-scale change */ - /* encode each block */ - for (i = 0; i < 6; i++) { - mpeg4_encode_block(s, block[i], i, 0, zigzag_direct); + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->misc_bits+= bits - s->last_bits; + s->last_bits=bits; + } + + switch(mb_type) + { + case 0: /* direct */ + h263_encode_motion(s, motion_x, 1); + h263_encode_motion(s, motion_y, 1); + break; + case 1: /* bidir */ + h263_encode_motion(s, s->mv[0][0][0] - s->last_mv[0][0][0], s->f_code); + h263_encode_motion(s, s->mv[0][0][1] - s->last_mv[0][0][1], s->f_code); + h263_encode_motion(s, s->mv[1][0][0] - s->last_mv[1][0][0], s->b_code); + h263_encode_motion(s, s->mv[1][0][1] - s->last_mv[1][0][1], s->b_code); + s->last_mv[0][0][0]= s->mv[0][0][0]; + s->last_mv[0][0][1]= s->mv[0][0][1]; + s->last_mv[1][0][0]= s->mv[1][0][0]; + s->last_mv[1][0][1]= s->mv[1][0][1]; + break; + case 2: /* backward */ + h263_encode_motion(s, motion_x - s->last_mv[1][0][0], s->b_code); + h263_encode_motion(s, motion_y - s->last_mv[1][0][1], s->b_code); + s->last_mv[1][0][0]= motion_x; + s->last_mv[1][0][1]= motion_y; + break; + case 3: /* forward */ + h263_encode_motion(s, motion_x - s->last_mv[0][0][0], s->f_code); + h263_encode_motion(s, motion_y - s->last_mv[0][0][1], s->f_code); + s->last_mv[0][0][0]= motion_x; + s->last_mv[0][0][1]= motion_y; + 
break; + default: + printf("unknown mb type\n"); + return; + } + + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->mv_bits+= bits - s->last_bits; + s->last_bits=bits; + } + + /* encode each block */ + for (i = 0; i < 6; i++) { + mpeg4_encode_block(s, block[i], i, 0, zigzag_direct, NULL, &s->pb); + } + + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->p_tex_bits+= bits - s->last_bits; + s->last_bits=bits; + } + }else{ /* s->pict_type==B_TYPE */ + if ((cbp | motion_x | motion_y) == 0 && s->mv_type==MV_TYPE_16X16) { + /* check if the B frames can skip it too, as we must skip it if we skip here + why didnt they just compress the skip-mb bits instead of reusing them ?! */ + if(s->max_b_frames>0){ + int i; + int x,y, offset; + uint8_t *p_pic; + + x= s->mb_x*16; + y= s->mb_y*16; + if(x+16 > s->width) x= s->width-16; + if(y+16 > s->height) y= s->height-16; + + offset= x + y*s->linesize; + p_pic= s->new_picture[0] + offset; + + s->mb_skiped=1; + for(i=0; i<s->max_b_frames; i++){ + uint8_t *b_pic; + int diff; + + if(s->coded_order[i+1].pict_type!=B_TYPE) break; + + b_pic= s->coded_order[i+1].picture[0] + offset; + diff= pix_abs16x16(p_pic, b_pic, s->linesize); + if(diff>s->qscale*70){ //FIXME check that 70 is optimal + s->mb_skiped=0; + break; + } + } + }else + s->mb_skiped=1; + + if(s->mb_skiped==1){ + /* skip macroblock */ + put_bits(&s->pb, 1, 1); + + if(interleaved_stats){ + s->misc_bits++; + s->last_bits++; + } + s->skip_count++; + return; + } + } + + put_bits(&s->pb, 1, 0); /* mb coded */ + if(s->mv_type==MV_TYPE_16X16){ + cbpc = cbp & 3; + put_bits(&s->pb, + inter_MCBPC_bits[cbpc], + inter_MCBPC_code[cbpc]); + cbpy = cbp >> 2; + cbpy ^= 0xf; + put_bits(pb2, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); + + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->misc_bits+= bits - s->last_bits; + s->last_bits=bits; + } + + /* motion vectors: 16x16 mode */ + h263_pred_motion(s, 0, &pred_x, &pred_y); + + h263_encode_motion(s, motion_x - pred_x, 
s->f_code); + h263_encode_motion(s, motion_y - pred_y, s->f_code); + }else{ + cbpc = (cbp & 3)+16; + put_bits(&s->pb, + inter_MCBPC_bits[cbpc], + inter_MCBPC_code[cbpc]); + cbpy = cbp >> 2; + cbpy ^= 0xf; + put_bits(pb2, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); + + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->misc_bits+= bits - s->last_bits; + s->last_bits=bits; + } + + for(i=0; i<4; i++){ + /* motion vectors: 8x8 mode*/ + h263_pred_motion(s, i, &pred_x, &pred_y); + + h263_encode_motion(s, s->motion_val[ s->block_index[i] ][0] - pred_x, s->f_code); + h263_encode_motion(s, s->motion_val[ s->block_index[i] ][1] - pred_y, s->f_code); + } + } + + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->mv_bits+= bits - s->last_bits; + s->last_bits=bits; + } + + /* encode each block */ + for (i = 0; i < 6; i++) { + mpeg4_encode_block(s, block[i], i, 0, zigzag_direct, NULL, tex_pb); + } + + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->p_tex_bits+= bits - s->last_bits; + s->last_bits=bits; + } + s->p_count++; } - bits= get_bit_count(&s->pb); - s->p_tex_bits+= bits - s->last_bits; - s->last_bits=bits; - s->p_count++; } else { + int cbp; int dc_diff[6]; //dc values with the dc prediction subtracted int dir[6]; //prediction direction int zigzag_last_index[6]; @@ -381,22 +544,26 @@ void mpeg4_encode_mb(MpegEncContext * s, inter_MCBPC_bits[cbpc + 4], inter_MCBPC_code[cbpc + 4]); } - put_bits(&s->pb, 1, s->ac_pred); + put_bits(pb2, 1, s->ac_pred); cbpy = cbp >> 2; - put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); + put_bits(pb2, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); - bits= get_bit_count(&s->pb); - s->misc_bits+= bits - s->last_bits; - s->last_bits=bits; + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->misc_bits+= bits - s->last_bits; + s->last_bits=bits; + } /* encode each block */ for (i = 0; i < 6; i++) { - mpeg4_encode_block(s, block[i], i, dc_diff[i], scan_table[i]); + mpeg4_encode_block(s, block[i], i, dc_diff[i], 
scan_table[i], dc_pb, tex_pb); } - bits= get_bit_count(&s->pb); - s->i_tex_bits+= bits - s->last_bits; - s->last_bits=bits; + if(interleaved_stats){ + bits= get_bit_count(&s->pb); + s->i_tex_bits+= bits - s->last_bits; + s->last_bits=bits; + } s->i_count++; /* restore ac coeffs & last_index stuff if we messed them up with the prediction */ @@ -425,76 +592,169 @@ void h263_encode_mb(MpegEncContext * s, int motion_x, int motion_y) { int cbpc, cbpy, i, cbp, pred_x, pred_y; - - // printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y); - if (!s->mb_intra) { - /* compute cbp */ - cbp = 0; - for (i = 0; i < 6; i++) { - if (s->block_last_index[i] >= 0) - cbp |= 1 << (5 - i); - } - if ((cbp | motion_x | motion_y) == 0) { - /* skip macroblock */ - put_bits(&s->pb, 1, 1); - return; - } - put_bits(&s->pb, 1, 0); /* mb coded */ - cbpc = cbp & 3; - put_bits(&s->pb, - inter_MCBPC_bits[cbpc], - inter_MCBPC_code[cbpc]); - cbpy = cbp >> 2; - cbpy ^= 0xf; - put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); - - /* motion vectors: 16x16 mode only now */ - h263_pred_motion(s, 0, &pred_x, &pred_y); + INT16 pred_dc; + INT16 rec_intradc[6]; + UINT16 *dc_ptr[6]; + + //printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y); + if (!s->mb_intra) { + /* compute cbp */ + cbp = 0; + for (i = 0; i < 6; i++) { + if (s->block_last_index[i] >= 0) + cbp |= 1 << (5 - i); + } + if ((cbp | motion_x | motion_y) == 0) { + /* skip macroblock */ + put_bits(&s->pb, 1, 1); + return; + } + put_bits(&s->pb, 1, 0); /* mb coded */ + cbpc = cbp & 3; + put_bits(&s->pb, + inter_MCBPC_bits[cbpc], + inter_MCBPC_code[cbpc]); + cbpy = cbp >> 2; + cbpy ^= 0xf; + put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); + + /* motion vectors: 16x16 mode only now */ + h263_pred_motion(s, 0, &pred_x, &pred_y); - if (!s->umvplus) { - h263_encode_motion(s, motion_x - pred_x); - h263_encode_motion(s, motion_y - pred_y); - } - else { - h263p_encode_umotion(s, motion_x - pred_x); - h263p_encode_umotion(s, motion_y - pred_y); - if (((motion_x - 
pred_x) == 1) && ((motion_y - pred_y) == 1)) - /* To prevent Start Code emulation */ - put_bits(&s->pb,1,1); - } - } else { - /* compute cbp */ - cbp = 0; - for (i = 0; i < 6; i++) { - if (s->block_last_index[i] >= 1) - cbp |= 1 << (5 - i); - } + if (!s->umvplus) { + h263_encode_motion(s, motion_x - pred_x, s->f_code); + h263_encode_motion(s, motion_y - pred_y, s->f_code); + } + else { + h263p_encode_umotion(s, motion_x - pred_x); + h263p_encode_umotion(s, motion_y - pred_y); + if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1)) + /* To prevent Start Code emulation */ + put_bits(&s->pb,1,1); + } + } else { + int li = s->h263_aic ? 0 : 1; + + cbp = 0; + for(i=0; i<6; i++) { + /* Predict DC */ + if (s->h263_aic && s->mb_intra) { + INT16 level = block[i][0]; + + pred_dc = h263_pred_dc(s, i, &dc_ptr[i]); + level -= pred_dc; + /* Quant */ + if (level < 0) + level = (level + (s->qscale >> 1))/(s->y_dc_scale); + else + level = (level - (s->qscale >> 1))/(s->y_dc_scale); + + /* AIC can change CBP */ + if (level == 0 && s->block_last_index[i] == 0) + s->block_last_index[i] = -1; + else if (level < -127) + level = -127; + else if (level > 127) + level = 127; + + block[i][0] = level; + /* Reconstruction */ + rec_intradc[i] = (s->y_dc_scale*level) + pred_dc; + /* Oddify */ + rec_intradc[i] |= 1; + //if ((rec_intradc[i] % 2) == 0) + // rec_intradc[i]++; + /* Clipping */ + if (rec_intradc[i] < 0) + rec_intradc[i] = 0; + else if (rec_intradc[i] > 2047) + rec_intradc[i] = 2047; + + /* Update AC/DC tables */ + *dc_ptr[i] = rec_intradc[i]; + } + /* compute cbp */ + if (s->block_last_index[i] >= li) + cbp |= 1 << (5 - i); + } - cbpc = cbp & 3; - if (s->pict_type == I_TYPE) { - put_bits(&s->pb, - intra_MCBPC_bits[cbpc], - intra_MCBPC_code[cbpc]); - } else { - put_bits(&s->pb, 1, 0); /* mb coded */ - put_bits(&s->pb, - inter_MCBPC_bits[cbpc + 4], - inter_MCBPC_code[cbpc + 4]); - } - if (s->h263_pred) { - /* XXX: currently, we do not try to use ac prediction */ - 
put_bits(&s->pb, 1, 0); /* no ac prediction */ - } - cbpy = cbp >> 2; - put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); + cbpc = cbp & 3; + if (s->pict_type == I_TYPE) { + put_bits(&s->pb, + intra_MCBPC_bits[cbpc], + intra_MCBPC_code[cbpc]); + } else { + put_bits(&s->pb, 1, 0); /* mb coded */ + put_bits(&s->pb, + inter_MCBPC_bits[cbpc + 4], + inter_MCBPC_code[cbpc + 4]); + } + if (s->h263_aic) { + /* XXX: currently, we do not try to use ac prediction */ + put_bits(&s->pb, 1, 0); /* no AC prediction */ + } + cbpy = cbp >> 2; + put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]); } - /* encode each block */ - for (i = 0; i < 6; i++) { + for(i=0; i<6; i++) { + /* encode each block */ h263_encode_block(s, block[i], i); + + /* Update INTRADC for decoding */ + if (s->h263_aic && s->mb_intra) { + block[i][0] = rec_intradc[i]; + + } } } +static int h263_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr) +{ + int x, y, wrap, a, c, pred_dc, scale; + INT16 *dc_val, *ac_val; + + /* find prediction */ + if (n < 4) { + x = 2 * s->mb_x + 1 + (n & 1); + y = 2 * s->mb_y + 1 + ((n & 2) >> 1); + wrap = s->mb_width * 2 + 2; + dc_val = s->dc_val[0]; + ac_val = s->ac_val[0][0]; + scale = s->y_dc_scale; + } else { + x = s->mb_x + 1; + y = s->mb_y + 1; + wrap = s->mb_width + 2; + dc_val = s->dc_val[n - 4 + 1]; + ac_val = s->ac_val[n - 4 + 1][0]; + scale = s->c_dc_scale; + } + /* B C + * A X + */ + a = dc_val[(x - 1) + (y) * wrap]; + c = dc_val[(x) + (y - 1) * wrap]; + + /* No prediction outside GOB boundary */ + if (s->first_slice_line && ((n < 2) || (n > 3))) + c = 1024; + pred_dc = 1024; + /* just DC prediction */ + if (a != 1024 && c != 1024) + pred_dc = (a + c) >> 1; + else if (a != 1024) + pred_dc = a; + else + pred_dc = c; + + /* we assume pred is positive */ + //pred_dc = (pred_dc + (scale >> 1)) / scale; + *dc_val_ptr = &dc_val[x + y * wrap]; + return pred_dc; +} + + void h263_pred_acdc(MpegEncContext * s, INT16 *block, int n) { int x, y, wrap, a, c, pred_dc, 
scale, i; @@ -526,6 +786,9 @@ void h263_pred_acdc(MpegEncContext * s, INT16 *block, int n) a = dc_val[(x - 1) + (y) * wrap]; c = dc_val[(x) + (y - 1) * wrap]; + /* No prediction outside GOB boundary */ + if (s->first_slice_line && ((n < 2) || (n > 3))) + c = 1024; pred_dc = 1024; if (s->ac_pred) { if (s->h263_aic_dir) { @@ -588,13 +851,46 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block, mot_val = s->motion_val[xy]; - /* special case for first line */ - if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) { - A = s->motion_val[xy - 1]; - *px = A[0]; - *py = A[1]; + A = s->motion_val[xy - 1]; + /* special case for first (slice) line */ + if ((s->mb_y == 0 || s->first_slice_line) && block<3) { + // we cant just change some MVs to simulate that as we need them for the B frames (and ME) + // and if we ever support non rectangular objects than we need to do a few ifs here anyway :( + if(block==0){ //most common case + if(s->mb_x == s->resync_mb_x){ //rare + *px= *py = 0; + }else if(s->mb_x + 1 == s->resync_mb_x){ //rare + C = s->motion_val[xy + off[block] - wrap]; + if(s->mb_x==0){ + *px = C[0]; + *py = C[1]; + }else{ + *px = mid_pred(A[0], 0, C[0]); + *py = mid_pred(A[1], 0, C[1]); + } + }else{ + *px = A[0]; + *py = A[1]; + } + }else if(block==1){ + if(s->mb_x + 1 == s->resync_mb_x){ //rare + C = s->motion_val[xy + off[block] - wrap]; + *px = mid_pred(A[0], 0, C[0]); + *py = mid_pred(A[1], 0, C[1]); + }else{ + *px = A[0]; + *py = A[1]; + } + }else{ /* block==2*/ + B = s->motion_val[xy - wrap]; + C = s->motion_val[xy + off[block] - wrap]; + if(s->mb_x == s->resync_mb_x) //rare + A[0]=A[1]=0; + + *px = mid_pred(A[0], B[0], C[0]); + *py = mid_pred(A[1], B[1], C[1]); + } } else { - A = s->motion_val[xy - 1]; B = s->motion_val[xy - wrap]; C = s->motion_val[xy + off[block] - wrap]; *px = mid_pred(A[0], B[0], C[0]); @@ -603,7 +899,7 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block, return mot_val; } -static void 
h263_encode_motion(MpegEncContext * s, int val) +static void h263_encode_motion(MpegEncContext * s, int val, int f_code) { int range, l, m, bit_size, sign, code, bits; @@ -612,7 +908,7 @@ static void h263_encode_motion(MpegEncContext * s, int val) code = 0; put_bits(&s->pb, mvtab[code][1], mvtab[code][0]); } else { - bit_size = s->f_code - 1; + bit_size = f_code - 1; range = 1 << bit_size; /* modulo encoding */ l = range * 32; @@ -624,17 +920,14 @@ static void h263_encode_motion(MpegEncContext * s, int val) } if (val >= 0) { - val--; - code = (val >> bit_size) + 1; - bits = val & (range - 1); sign = 0; } else { val = -val; - val--; - code = (val >> bit_size) + 1; - bits = val & (range - 1); sign = 1; } + val--; + code = (val >> bit_size) + 1; + bits = val & (range - 1); put_bits(&s->pb, mvtab[code][1] + 1, (mvtab[code][0] << 1) | sign); if (bit_size > 0) { @@ -724,11 +1017,11 @@ static void init_mv_penalty_and_fcode(MpegEncContext *s) } } -static void init_uni_dc_tab() +static void init_uni_dc_tab(void) { int level, uni_code, uni_len; - for(level=-255; level<256; level++){ + for(level=-256; level<256; level++){ int size, v, l; /* find number of bits */ size = 0; @@ -787,23 +1080,42 @@ void h263_encode_init(MpegEncContext *s) init_rl(&rl_inter); init_rl(&rl_intra); + init_rl(&rl_intra_aic); init_mv_penalty_and_fcode(s); } s->mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p // use fcodes >1 only for mpeg4 & h263 & h263p FIXME - if(s->h263_plus) s->fcode_tab= umv_fcode_tab; - else if(s->h263_pred && !s->h263_msmpeg4) s->fcode_tab= fcode_tab; + switch(s->codec_id){ + case CODEC_ID_MPEG4: + s->fcode_tab= fcode_tab; + s->min_qcoeff= -2048; + s->max_qcoeff= 2047; + break; + case CODEC_ID_H263P: + s->fcode_tab= umv_fcode_tab; + s->min_qcoeff= -128; + s->max_qcoeff= 127; + break; + default: //nothing needed default table allready set in mpegvideo.c + s->min_qcoeff= -128; + s->max_qcoeff= 127; + } + + /* h263 type bias */ + //FIXME mpeg4 mpeg quantizer + 
s->intra_quant_bias=0; + s->inter_quant_bias=-(1<<(QUANT_BIAS_SHIFT-2)); //(a - x/4)/x } static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n) { - int level, run, last, i, j, last_index, last_non_zero, sign, slevel; - int code; - RLTable *rl = &rl_inter; + int level, run, last, i, j, last_index, last_non_zero, sign, slevel, code; + RLTable *rl; - if (s->mb_intra) { + rl = &rl_inter; + if (s->mb_intra && !s->h263_aic) { /* DC coef */ level = block[0]; /* 255 cannot be represented, so we clamp */ @@ -823,23 +1135,25 @@ static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n) i = 1; } else { i = 0; + if (s->h263_aic && s->mb_intra) + rl = &rl_intra_aic; } - + /* AC coefs */ last_index = s->block_last_index[n]; last_non_zero = i - 1; for (; i <= last_index; i++) { - j = zigzag_direct[i]; - level = block[j]; - if (level) { - run = i - last_non_zero - 1; - last = (i == last_index); - sign = 0; - slevel = level; - if (level < 0) { - sign = 1; - level = -level; - } + j = zigzag_direct[i]; + level = block[j]; + if (level) { + run = i - last_non_zero - 1; + last = (i == last_index); + sign = 0; + slevel = level; + if (level < 0) { + sign = 1; + level = -level; + } code = get_rl_index(rl, last, run, level); put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]); if (code == rl->n) { @@ -849,42 +1163,60 @@ static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n) } else { put_bits(&s->pb, 1, sign); } - last_non_zero = i; - } + last_non_zero = i; + } } } /***************************************************/ -static void mpeg4_stuffing(PutBitContext * pbc) +void ff_mpeg4_stuffing(PutBitContext * pbc) { int length; put_bits(pbc, 1, 0); length= (-get_bit_count(pbc))&7; - put_bits(pbc, length, (1<<length)-1); + if(length) put_bits(pbc, length, (1<<length)-1); } -static void put_string(PutBitContext * pbc, char *s) -{ - while(*s){ - put_bits(pbc, 8, *s); - s++; +/* must be called before writing the header */ +void 
ff_set_mpeg4_time(MpegEncContext * s, int picture_number){ + int time_div, time_mod; + + if(s->pict_type==I_TYPE){ //we will encode a vol header + s->time_increment_resolution= s->frame_rate/ff_gcd(s->frame_rate, FRAME_RATE_BASE); + if(s->time_increment_resolution>=256*256) s->time_increment_resolution= 256*128; + + s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1; + } + + s->time= picture_number*(INT64)FRAME_RATE_BASE*s->time_increment_resolution/s->frame_rate; + time_div= s->time/s->time_increment_resolution; + time_mod= s->time%s->time_increment_resolution; + + if(s->pict_type==B_TYPE){ + s->bp_time= s->last_non_b_time - s->time; + }else{ + s->last_time_base= s->time_base; + s->time_base= time_div; + s->pp_time= s->time - s->last_non_b_time; + s->last_non_b_time= s->time; } - put_bits(pbc, 8, 0); } static void mpeg4_encode_vol_header(MpegEncContext * s) { int vo_ver_id=1; //must be 2 if we want GMC or q-pel + char buf[255]; + + s->vo_type= s->has_b_frames ? CORE_VO_TYPE : SIMPLE_VO_TYPE; - if(get_bit_count(&s->pb)!=0) mpeg4_stuffing(&s->pb); put_bits(&s->pb, 16, 0); put_bits(&s->pb, 16, 0x100); /* video obj */ put_bits(&s->pb, 16, 0); put_bits(&s->pb, 16, 0x120); /* video obj layer */ put_bits(&s->pb, 1, 0); /* random access vol */ - put_bits(&s->pb, 8, 1); /* video obj type indication= simple obj */ + put_bits(&s->pb, 8, s->vo_type); /* video obj type indication */ put_bits(&s->pb, 1, 1); /* is obj layer id= yes */ put_bits(&s->pb, 4, vo_ver_id); /* is obj layer ver id */ put_bits(&s->pb, 3, 1); /* is obj layer priority */ @@ -892,11 +1224,20 @@ static void mpeg4_encode_vol_header(MpegEncContext * s) put_bits(&s->pb, 4, s->aspect_ratio_info);/* aspect ratio info */ else put_bits(&s->pb, 4, 1); /* aspect ratio info= sqare pixel */ - put_bits(&s->pb, 1, 0); /* vol control parameters= no */ + + if(s->low_delay){ + put_bits(&s->pb, 1, 1); /* vol control parameters= yes */ + put_bits(&s->pb, 2, 1); /* chroma format YUV 420/YV12 */ + 
put_bits(&s->pb, 1, s->low_delay); + put_bits(&s->pb, 1, 0); /* vbv parameters= no */ + }else{ + put_bits(&s->pb, 1, 0); /* vol control parameters= no */ + } + put_bits(&s->pb, 2, RECT_SHAPE); /* vol shape= rectangle */ put_bits(&s->pb, 1, 1); /* marker bit */ - put_bits(&s->pb, 16, s->time_increment_resolution=30000); - s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1; + + put_bits(&s->pb, 16, s->time_increment_resolution); if (s->time_increment_bits < 1) s->time_increment_bits = 1; put_bits(&s->pb, 1, 1); /* marker bit */ @@ -918,37 +1259,56 @@ static void mpeg4_encode_vol_header(MpegEncContext * s) if (vo_ver_id != 1) put_bits(&s->pb, 1, s->quarter_sample=0); put_bits(&s->pb, 1, 1); /* complexity estimation disable */ - put_bits(&s->pb, 1, 1); /* resync marker disable */ - put_bits(&s->pb, 1, 0); /* data partitioned */ + s->resync_marker= s->rtp_mode; + put_bits(&s->pb, 1, s->resync_marker ? 0 : 1);/* resync marker disable */ + put_bits(&s->pb, 1, s->data_partitioning ? 1 : 0); + if(s->data_partitioning){ + put_bits(&s->pb, 1, 0); /* no rvlc */ + } + if (vo_ver_id != 1){ put_bits(&s->pb, 1, 0); /* newpred */ put_bits(&s->pb, 1, 0); /* reduced res vop */ } put_bits(&s->pb, 1, 0); /* scalability */ - mpeg4_stuffing(&s->pb); + ff_mpeg4_stuffing(&s->pb); put_bits(&s->pb, 16, 0); put_bits(&s->pb, 16, 0x1B2); /* user_data */ - put_string(&s->pb, "ffmpeg"); //FIXME append some version ... 
+ sprintf(buf, "FFmpeg%sb%s", FFMPEG_VERSION, LIBAVCODEC_BUILD_STR); + put_string(&s->pb, buf); - s->no_rounding = 0; + ff_mpeg4_stuffing(&s->pb); } /* write mpeg4 VOP header */ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number) { - if(s->pict_type==I_TYPE) mpeg4_encode_vol_header(s); - - if(get_bit_count(&s->pb)!=0) mpeg4_stuffing(&s->pb); + int time_incr; + int time_div, time_mod; + + if(s->pict_type==I_TYPE){ + s->no_rounding=0; + if(picture_number==0 || !s->strict_std_compliance) + mpeg4_encode_vol_header(s); + } + +//printf("num:%d rate:%d base:%d\n", s->picture_number, s->frame_rate, FRAME_RATE_BASE); + put_bits(&s->pb, 16, 0); /* vop header */ put_bits(&s->pb, 16, 0x1B6); /* vop header */ put_bits(&s->pb, 2, s->pict_type - 1); /* pict type: I = 0 , P = 1 */ - /* XXX: time base + 1 not always correct */ - put_bits(&s->pb, 1, 1); + + time_div= s->time/s->time_increment_resolution; + time_mod= s->time%s->time_increment_resolution; + time_incr= time_div - s->last_time_base; + while(time_incr--) + put_bits(&s->pb, 1, 1); + put_bits(&s->pb, 1, 0); put_bits(&s->pb, 1, 1); /* marker */ - put_bits(&s->pb, s->time_increment_bits, 1); /* XXX: correct time increment */ + put_bits(&s->pb, s->time_increment_bits, time_mod); /* time increment */ put_bits(&s->pb, 1, 1); /* marker */ put_bits(&s->pb, 1, 1); /* vop coded */ if ( s->pict_type == P_TYPE @@ -1106,16 +1466,17 @@ static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n, } } -static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n) +static inline void mpeg4_encode_dc(PutBitContext * s, int level, int n) { #if 1 +// if(level<-255 || level>255) printf("dc overflow\n"); level+=256; if (n < 4) { /* luminance */ - put_bits(&s->pb, uni_DCtab_lum[level][1], uni_DCtab_lum[level][0]); + put_bits(s, uni_DCtab_lum[level][1], uni_DCtab_lum[level][0]); } else { /* chrominance */ - put_bits(&s->pb, uni_DCtab_chrom[level][1], uni_DCtab_chrom[level][0]); + put_bits(s, 
uni_DCtab_chrom[level][1], uni_DCtab_chrom[level][0]); } #else int size, v; @@ -1146,7 +1507,8 @@ static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n) #endif } -static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, UINT8 *scan_table) +static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, + UINT8 *scan_table, PutBitContext *dc_pb, PutBitContext *ac_pb) { int level, run, last, i, j, last_index, last_non_zero, sign, slevel; int code; @@ -1154,7 +1516,7 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i if (s->mb_intra) { /* mpeg4 based DC predictor */ - mpeg4_encode_dc(s, intra_dc, n); + mpeg4_encode_dc(dc_pb, intra_dc, n); i = 1; rl = &rl_intra; } else { @@ -1178,7 +1540,7 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i level = -level; } code = get_rl_index(rl, last, run, level); - put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]); + put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]); if (code == rl->n) { int level1, run1; level1 = level - rl->max_level[last][run]; @@ -1187,7 +1549,7 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i code = get_rl_index(rl, last, run, level1); if (code == rl->n) { esc2: - put_bits(&s->pb, 1, 1); + put_bits(ac_pb, 1, 1); if (level > MAX_LEVEL) goto esc3; run1 = run - rl->max_run[last][level] - 1; @@ -1197,26 +1559,26 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i if (code == rl->n) { esc3: /* third escape */ - put_bits(&s->pb, 1, 1); - put_bits(&s->pb, 1, last); - put_bits(&s->pb, 6, run); - put_bits(&s->pb, 1, 1); - put_bits(&s->pb, 12, slevel & 0xfff); - put_bits(&s->pb, 1, 1); + put_bits(ac_pb, 1, 1); + put_bits(ac_pb, 1, last); + put_bits(ac_pb, 6, run); + put_bits(ac_pb, 1, 1); + put_bits(ac_pb, 12, slevel & 0xfff); + put_bits(ac_pb, 1, 1); } else { /* second escape */ - 
put_bits(&s->pb, 1, 0); - put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]); - put_bits(&s->pb, 1, sign); + put_bits(ac_pb, 1, 0); + put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]); + put_bits(ac_pb, 1, sign); } } else { /* first escape */ - put_bits(&s->pb, 1, 0); - put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]); - put_bits(&s->pb, 1, sign); + put_bits(ac_pb, 1, 0); + put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]); + put_bits(ac_pb, 1, sign); } } else { - put_bits(&s->pb, 1, sign); + put_bits(ac_pb, 1, sign); } last_non_zero = i; } @@ -1265,11 +1627,11 @@ void init_rl(RLTable *rl) if (run > max_run[level]) max_run[level] = run; } - rl->max_level[last] = malloc(MAX_RUN + 1); + rl->max_level[last] = av_malloc(MAX_RUN + 1); memcpy(rl->max_level[last], max_level, MAX_RUN + 1); - rl->max_run[last] = malloc(MAX_LEVEL + 1); + rl->max_run[last] = av_malloc(MAX_LEVEL + 1); memcpy(rl->max_run[last], max_run, MAX_LEVEL + 1); - rl->index_run[last] = malloc(MAX_RUN + 1); + rl->index_run[last] = av_malloc(MAX_RUN + 1); memcpy(rl->index_run[last], index_run, MAX_RUN + 1); } } @@ -1356,82 +1718,146 @@ static inline void memsetw(short *tab, int val, int n) tab[i] = val; } -static int mpeg4_resync(MpegEncContext *s) +void ff_mpeg4_init_partitions(MpegEncContext *s) +{ + init_put_bits(&s->tex_pb, s->tex_pb_buffer, PB_BUFFER_SIZE, NULL, NULL); + init_put_bits(&s->pb2 , s->pb2_buffer , PB_BUFFER_SIZE, NULL, NULL); +} + +void ff_mpeg4_merge_partitions(MpegEncContext *s) +{ + const int pb2_len = get_bit_count(&s->pb2 ); + const int tex_pb_len= get_bit_count(&s->tex_pb); + const int bits= get_bit_count(&s->pb); + + if(s->pict_type==I_TYPE){ + put_bits(&s->pb, 19, DC_MARKER); + s->misc_bits+=19 + pb2_len + bits - s->last_bits; + s->i_tex_bits+= tex_pb_len; + }else{ + put_bits(&s->pb, 17, MOTION_MARKER); + s->misc_bits+=17 + pb2_len;; + s->mv_bits+= bits - s->last_bits; + s->p_tex_bits+= tex_pb_len; + } + + 
flush_put_bits(&s->pb2); + flush_put_bits(&s->tex_pb); + + ff_copy_bits(&s->pb, s->pb2_buffer , pb2_len); + ff_copy_bits(&s->pb, s->tex_pb_buffer, tex_pb_len); + s->last_bits= get_bit_count(&s->pb); +} + +void ff_mpeg4_encode_video_packet_header(MpegEncContext *s) { - int state, v, bits; int mb_num_bits= av_log2(s->mb_num - 1) + 1; - int header_extension=0, mb_num; - int c_wrap, c_xy, l_wrap, l_xy; -//printf("resync at %d %d\n", s->mb_x, s->mb_y); -//printf("%X\n", show_bits(&s->gb, 24)); - if( get_bits_count(&s->gb) > s->gb.size*8-32) - return 0; + ff_mpeg4_stuffing(&s->pb); + if(s->pict_type==I_TYPE) + put_bits(&s->pb, 16, 0); + else if(s->pict_type==B_TYPE) + put_bits(&s->pb, MAX(MAX(s->f_code, s->b_code)+15, 17), 0); + else /* S/P_TYPE */ + put_bits(&s->pb, s->f_code+15, 0); + put_bits(&s->pb, 1, 1); + + put_bits(&s->pb, mb_num_bits, s->mb_x + s->mb_y*s->mb_width); + put_bits(&s->pb, 5, s->qscale); + put_bits(&s->pb, 1, 0); /* no HEC */ +} - align_get_bits(&s->gb); - state = 0xff; - for(;;) { - v = get_bits(&s->gb, 8); -//printf("%X ", v); - state = ((state << 8) | v) & 0xffff; - if (state == 0) break; - if( get_bits_count(&s->gb) > s->gb.size*8-32){ - printf("resync failed\n"); - return -1; - } +/** + * decodes the next video packet and sets s->next_qscale + * returns mb_num of the next packet or <0 if something went wrong + */ +static int decode_video_packet_header(MpegEncContext *s, GetBitContext *gb) +{ + int bits; + int mb_num_bits= av_log2(s->mb_num - 1) + 1; + int header_extension=0, mb_num; +//printf("%X\n", show_bits(&gb, 24)); +//printf("parse_video_packet_header\n"); +// if(show_aligned_bits(gb, 1, 16) != 0) return -1; + + /* is there enough space left for a video packet + header */ + if( get_bits_count(gb) > gb->size*8-20) return -1; + +//printf("resync at %d %d\n", s->mb_x, s->mb_y); +// skip_bits(gb, 1); +// align_get_bits(gb); + if(get_bits(gb, 16)!=0){ + printf("internal error while decoding video packet header\n"); } -//printf("%X\n", 
show_bits(&s->gb, 24)); + +//printf("%X\n", show_bits(gb, 24)); bits=0; - while(!get_bits1(&s->gb) && bits<30) bits++; - if(s->pict_type == P_TYPE && bits != s->f_code-1) - printf("marker does not match f_code\n"); - //FIXME check bits for B-framess -//printf("%X\n", show_bits(&s->gb, 24)); + while(!get_bits1(gb) && bits<30) bits++; + if((s->pict_type == P_TYPE || s->pict_type == S_TYPE) && bits != s->f_code-1){ + printf("marker does not match f_code (is: %d should be: %d pos: %d end %d x: %d y: %d)\n", + bits+1, s->f_code, get_bits_count(gb), gb->size*8, s->mb_x, s->mb_y); + return -1; + }else if(s->pict_type == I_TYPE && bits != 0){ + printf("marker too long\n"); + return -1; + }else if(s->pict_type == B_TYPE && bits != MAX(MAX(s->f_code, s->b_code)-1, 1)){ + printf("marker does not match f/b_code\n"); + return -1; + } +//printf("%X\n", show_bits(gb, 24)); if(s->shape != RECT_SHAPE){ - header_extension= get_bits1(&s->gb); + header_extension= get_bits1(gb); //FIXME more stuff here } - mb_num= get_bits(&s->gb, mb_num_bits); - if(mb_num != s->mb_x + s->mb_y*s->mb_width){ - printf("MB-num change not supported %d %d\n", mb_num, s->mb_x + s->mb_y*s->mb_width); -// s->mb_x= mb_num % s->mb_width; -// s->mb_y= mb_num / s->mb_width; - //FIXME many vars are wrong now - } + mb_num= get_bits(gb, mb_num_bits); + if(mb_num < s->mb_x + s->mb_y*s->mb_width || mb_num>=s->mb_num){ + fprintf(stderr, "illegal mb_num in video packet (%d %d) \n", mb_num, s->mb_x + s->mb_y*s->mb_width); + return -1; + } if(s->shape != BIN_ONLY_SHAPE){ - s->qscale= get_bits(&s->gb, 5); - h263_dc_scale(s); + s->next_resync_qscale= get_bits(gb, 5); + if(s->next_resync_qscale==0) + s->next_resync_qscale= s->qscale; + if(s->next_resync_qscale==0){ + fprintf(stderr, "qscale==0\n"); + return -1; + } } if(s->shape == RECT_SHAPE){ - header_extension= get_bits1(&s->gb); + header_extension= get_bits1(gb); } if(header_extension){ + int time_increment; int time_incr=0; - printf("header extension not really 
supported\n"); - while (get_bits1(&s->gb) != 0) + printf("header extension not supported\n"); + return -1; + + while (get_bits1(gb) != 0) time_incr++; - check_marker(&s->gb, "before time_increment in video packed header"); - s->time_increment= get_bits(&s->gb, s->time_increment_bits); + check_marker(gb, "before time_increment in video packed header"); + time_increment= get_bits(gb, s->time_increment_bits); if(s->pict_type!=B_TYPE){ + s->last_time_base= s->time_base; s->time_base+= time_incr; - s->last_non_b_time[1]= s->last_non_b_time[0]; - s->last_non_b_time[0]= s->time_base*s->time_increment_resolution + s->time_increment; + s->time= s->time_base*s->time_increment_resolution + time_increment; + s->pp_time= s->time - s->last_non_b_time; + s->last_non_b_time= s->time; }else{ - s->time= (s->last_non_b_time[1]/s->time_increment_resolution + time_incr)*s->time_increment_resolution; - s->time+= s->time_increment; + s->time= (s->last_time_base + time_incr)*s->time_increment_resolution + time_increment; + s->bp_time= s->last_non_b_time - s->time; } - check_marker(&s->gb, "before vop_coding_type in video packed header"); + check_marker(gb, "before vop_coding_type in video packed header"); - skip_bits(&s->gb, 2); /* vop coding type */ + skip_bits(gb, 2); /* vop coding type */ //FIXME not rect stuff here if(s->shape != BIN_ONLY_SHAPE){ - skip_bits(&s->gb, 3); /* intra dc vlc threshold */ + skip_bits(gb, 3); /* intra dc vlc threshold */ if(s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE && s->num_sprite_warping_points){ mpeg4_decode_sprite_trajectory(s); @@ -1440,44 +1866,506 @@ static int mpeg4_resync(MpegEncContext *s) //FIXME reduced res stuff here if (s->pict_type != I_TYPE) { - s->f_code = get_bits(&s->gb, 3); /* fcode_for */ + s->f_code = get_bits(gb, 3); /* fcode_for */ if(s->f_code==0){ printf("Error, video packet header damaged or not MPEG4 header (f_code=0)\n"); return -1; // makes no sense to continue, as the MV decoding will break very quickly } } if 
(s->pict_type == B_TYPE) { - s->b_code = get_bits(&s->gb, 3); + s->b_code = get_bits(gb, 3); } } - } //FIXME new-pred stuff + +//printf("parse ok %d %d %d %d\n", mb_num, s->mb_x + s->mb_y*s->mb_width, get_bits_count(gb), get_bits_count(&s->gb)); + + return mb_num; +} + +void ff_mpeg4_clean_buffers(MpegEncContext *s) +{ + int c_wrap, c_xy, l_wrap, l_xy; l_wrap= s->block_wrap[0]; - l_xy= s->mb_y*l_wrap*2; + l_xy= s->mb_y*l_wrap*2 + s->mb_x*2; c_wrap= s->block_wrap[4]; - c_xy= s->mb_y*c_wrap; + c_xy= s->mb_y*c_wrap + s->mb_x; /* clean DC */ - memsetw(s->dc_val[0] + l_xy, 1024, l_wrap*3); - memsetw(s->dc_val[1] + c_xy, 1024, c_wrap*2); - memsetw(s->dc_val[2] + c_xy, 1024, c_wrap*2); + memsetw(s->dc_val[0] + l_xy, 1024, l_wrap*2+1); + memsetw(s->dc_val[1] + c_xy, 1024, c_wrap+1); + memsetw(s->dc_val[2] + c_xy, 1024, c_wrap+1); /* clean AC */ - memset(s->ac_val[0] + l_xy, 0, l_wrap*3*16*sizeof(INT16)); - memset(s->ac_val[1] + c_xy, 0, c_wrap*2*16*sizeof(INT16)); - memset(s->ac_val[2] + c_xy, 0, c_wrap*2*16*sizeof(INT16)); + memset(s->ac_val[0] + l_xy, 0, (l_wrap*2+1)*16*sizeof(INT16)); + memset(s->ac_val[1] + c_xy, 0, (c_wrap +1)*16*sizeof(INT16)); + memset(s->ac_val[2] + c_xy, 0, (c_wrap +1)*16*sizeof(INT16)); /* clean MV */ - memset(s->motion_val + l_xy, 0, l_wrap*3*2*sizeof(INT16)); + // we cant clear the MVs as they might be needed by a b frame +// memset(s->motion_val + l_xy, 0, (l_wrap*2+1)*2*sizeof(INT16)); // memset(s->motion_val, 0, 2*sizeof(INT16)*(2 + s->mb_width*2)*(2 + s->mb_height*2)); - s->resync_x_pos= s->mb_x; + s->last_mv[0][0][0]= + s->last_mv[0][0][1]= + s->last_mv[1][0][0]= + s->last_mv[1][0][1]= 0; +} + +/* searches for the next resync marker clears ac,dc,mc, and sets s->next_resync_gb, s->mb_num_left */ +int ff_mpeg4_resync(MpegEncContext *s) +{ + GetBitContext gb; + + /* search & parse next resync marker */ + + gb= s->next_resync_gb; + align_get_bits(&gb); +//printf("mpeg4_resync %d next:%d \n", get_bits_count(&gb), 
get_bits_count(&s->next_resync_gb)); + for(;;) { + int v= show_bits(&gb, 24); + if( get_bits_count(&gb) >= gb.size*8-24 || v == 1 /* start-code */){ + s->mb_num_left= s->mb_num - s->mb_x - s->mb_y*s->mb_width; +//printf("mpeg4_resync end\n"); + s->gb= s->next_resync_gb; //continue at the next resync marker + return -1; + }else if(v>>8 == 0){ + int next; + s->next_resync_pos= get_bits_count(&gb); + + next= decode_video_packet_header(s, &gb); + if(next >= 0){ + s->mb_num_left= next - s->mb_x - s->mb_y*s->mb_width; + break; + } + + align_get_bits(&gb); + } + skip_bits(&gb, 8); + } + s->next_resync_gb=gb; + + return 0; +} + +static inline void init_block_index(MpegEncContext *s) +{ + s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1 + s->mb_x*2; + s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1) + s->mb_x*2; + s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1 + s->mb_x*2; + s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2) + s->mb_x*2; + s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x; + s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x; +} + +static inline void update_block_index(MpegEncContext *s) +{ + s->block_index[0]+=2; + s->block_index[1]+=2; + s->block_index[2]+=2; + s->block_index[3]+=2; + s->block_index[4]++; + s->block_index[5]++; +} + +/** + * decodes the first & second partition + * returns error type or 0 if no error + */ +int ff_mpeg4_decode_partitions(MpegEncContext *s) +{ + static const INT8 quant_tab[4] = { -1, -2, 1, 2 }; + int mb_num; + + /* decode first partition */ + mb_num=0; s->first_slice_line=1; + s->mb_x= s->resync_mb_x; + for(s->mb_y= s->resync_mb_y; mb_num < s->mb_num_left; s->mb_y++){ + init_block_index(s); + for(; mb_num < s->mb_num_left && s->mb_x<s->mb_width; s->mb_x++){ + const int xy= s->mb_x + s->mb_y*s->mb_width; + int cbpc; + int dir=0; + + mb_num++; + update_block_index(s); + if(s->mb_x == 
s->resync_mb_x && s->mb_y == s->resync_mb_y+1) + s->first_slice_line=0; + + if(s->mb_x==0) PRINT_MB_TYPE("\n"); + + if(s->pict_type==I_TYPE){ + int i; + + PRINT_MB_TYPE("I"); + cbpc = get_vlc(&s->gb, &intra_MCBPC_vlc); + if (cbpc < 0){ + fprintf(stderr, "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y); + return DECODING_DESYNC; + } + s->cbp_table[xy]= cbpc & 3; + s->mb_type[xy]= MB_TYPE_INTRA; + s->mb_intra = 1; + + if(cbpc & 4) { + s->qscale += quant_tab[get_bits(&s->gb, 2)]; + if (s->qscale < 1) + s->qscale = 1; + else if (s->qscale > 31) + s->qscale = 31; + h263_dc_scale(s); + } + s->qscale_table[xy]= s->qscale; + + s->mbintra_table[xy]= 1; + for(i=0; i<6; i++){ + int dc_pred_dir; + int dc= mpeg4_decode_dc(s, i, &dc_pred_dir); + if(dc < 0){ + fprintf(stderr, "DC corrupted at %d %d\n", s->mb_x, s->mb_y); + return DECODING_DESYNC; + } + dir<<=1; + if(dc_pred_dir) dir|=1; + } + s->pred_dir_table[xy]= dir; + }else{ /* P/S_TYPE */ + int mx, my, pred_x, pred_y; + INT16 * const mot_val= s->motion_val[s->block_index[0]]; + const int stride= s->block_wrap[0]*2; + + if(get_bits1(&s->gb)){ + /* skip mb */ + s->mb_type[xy]= MB_TYPE_SKIPED; + if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE){ + const int a= s->sprite_warping_accuracy; + PRINT_MB_TYPE("G"); + if(s->divx_version==500 && s->divx_build==413){ + mx = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample)); + my = s->sprite_offset[0][1] / (1<<(a-s->quarter_sample)); + }else{ + mx = RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample); + my = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample); + s->mb_type[xy]= MB_TYPE_GMC | MB_TYPE_SKIPED; + } + }else{ + PRINT_MB_TYPE("S"); + mx = 0; + my = 0; + } + mot_val[0 ]= mot_val[2 ]= + mot_val[0+stride]= mot_val[2+stride]= mx; + mot_val[1 ]= mot_val[3 ]= + mot_val[1+stride]= mot_val[3+stride]= my; + + if(s->mbintra_table[xy]) + ff_clean_intra_table_entries(s); + + continue; + } + cbpc = get_vlc(&s->gb, &inter_MCBPC_vlc); + if (cbpc < 0){ + fprintf(stderr, "cbpc 
corrupted at %d %d\n", s->mb_x, s->mb_y); + return DECODING_DESYNC; + } + if (cbpc > 20) + cbpc+=3; + else if (cbpc == 20) + fprintf(stderr, "Stuffing !"); + s->cbp_table[xy]= cbpc&(8+3); //8 is dquant + + s->mb_intra = ((cbpc & 4) != 0); + + if(s->mb_intra){ + PRINT_MB_TYPE("I"); + s->mbintra_table[xy]= 1; + s->mb_type[xy]= MB_TYPE_INTRA; + mot_val[0 ]= mot_val[2 ]= + mot_val[0+stride]= mot_val[2+stride]= 0; + mot_val[1 ]= mot_val[3 ]= + mot_val[1+stride]= mot_val[3+stride]= 0; + }else{ + if(s->mbintra_table[xy]) + ff_clean_intra_table_entries(s); + + if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE && (cbpc & 16) == 0) + s->mcsel= get_bits1(&s->gb); + else s->mcsel= 0; + + if ((cbpc & 16) == 0) { + PRINT_MB_TYPE("P"); + /* 16x16 motion prediction */ + s->mb_type[xy]= MB_TYPE_INTER; + + h263_pred_motion(s, 0, &pred_x, &pred_y); + if(!s->mcsel) + mx = h263_decode_motion(s, pred_x, s->f_code); + else { + const int a= s->sprite_warping_accuracy; + if(s->divx_version==500 && s->divx_build==413){ + mx = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample)); + }else{ + mx = RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample); + } + } + if (mx >= 0xffff) + return DECODING_DESYNC; + + if(!s->mcsel) + my = h263_decode_motion(s, pred_y, s->f_code); + else{ + const int a= s->sprite_warping_accuracy; + if(s->divx_version==500 && s->divx_build==413){ + my = s->sprite_offset[0][1] / (1<<(a-s->quarter_sample)); + }else{ + my = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample); + } + } + if (my >= 0xffff) + return DECODING_DESYNC; + mot_val[0 ]= mot_val[2 ] = + mot_val[0+stride]= mot_val[2+stride]= mx; + mot_val[1 ]= mot_val[3 ]= + mot_val[1+stride]= mot_val[3+stride]= my; + } else { + int i; + PRINT_MB_TYPE("4"); + s->mb_type[xy]= MB_TYPE_INTER4V; + for(i=0;i<4;i++) { + INT16 *mot_val= h263_pred_motion(s, i, &pred_x, &pred_y); + mx = h263_decode_motion(s, pred_x, s->f_code); + if (mx >= 0xffff) + return DECODING_DESYNC; + + my = h263_decode_motion(s, pred_y, 
s->f_code); + if (my >= 0xffff) + return DECODING_DESYNC; + mot_val[0] = mx; + mot_val[1] = my; + } + } + } + } + } + s->mb_x= 0; + } + + if (s->pict_type==I_TYPE && get_bits(&s->gb, 19)!=DC_MARKER ) s->decoding_error= DECODING_DESYNC; + else if(s->pict_type!=I_TYPE && get_bits(&s->gb, 17)!=MOTION_MARKER) s->decoding_error= DECODING_DESYNC; + if(s->decoding_error== DECODING_DESYNC){ + fprintf(stderr, "marker missing after first partition at %d %d\n", s->mb_x, s->mb_y); + return DECODING_DESYNC; + } + + /* decode second partition */ + mb_num=0; + s->mb_x= s->resync_mb_x; + for(s->mb_y= s->resync_mb_y; mb_num < s->mb_num_left; s->mb_y++){ + init_block_index(s); + for(; mb_num < s->mb_num_left && s->mb_x<s->mb_width; s->mb_x++){ + const int xy= s->mb_x + s->mb_y*s->mb_width; + + mb_num++; + update_block_index(s); + + if(s->pict_type==I_TYPE){ + int ac_pred= get_bits1(&s->gb); + int cbpy = get_vlc(&s->gb, &cbpy_vlc); + if(cbpy<0){ + fprintf(stderr, "cbpy corrupted at %d %d\n", s->mb_x, s->mb_y); + return DECODING_AC_LOST; + } + + s->cbp_table[xy]|= cbpy<<2; + s->pred_dir_table[xy]|= ac_pred<<7; + }else{ /* P || S_TYPE */ + if(s->mb_type[xy]&MB_TYPE_INTRA){ + int dir=0,i; + int ac_pred = get_bits1(&s->gb); + int cbpy = get_vlc(&s->gb, &cbpy_vlc); + + if(cbpy<0){ + fprintf(stderr, "I cbpy corrupted at %d %d\n", s->mb_x, s->mb_y); + return DECODING_ACDC_LOST; + } + + if(s->cbp_table[xy] & 8) { + s->qscale += quant_tab[get_bits(&s->gb, 2)]; + if (s->qscale < 1) + s->qscale = 1; + else if (s->qscale > 31) + s->qscale = 31; + h263_dc_scale(s); + } + s->qscale_table[xy]= s->qscale; + + for(i=0; i<6; i++){ + int dc_pred_dir; + int dc= mpeg4_decode_dc(s, i, &dc_pred_dir); + if(dc < 0){ + fprintf(stderr, "DC corrupted at %d %d\n", s->mb_x, s->mb_y); + return DECODING_ACDC_LOST; + } + dir<<=1; + if(dc_pred_dir) dir|=1; + } + s->cbp_table[xy]&= 3; //remove dquant + s->cbp_table[xy]|= cbpy<<2; + s->pred_dir_table[xy]= dir | (ac_pred<<7); + }else if(s->mb_type[xy]&MB_TYPE_SKIPED){ + 
s->qscale_table[xy]= s->qscale; + s->cbp_table[xy]= 0; + }else{ + int cbpy = get_vlc(&s->gb, &cbpy_vlc); + + if(cbpy<0){ + fprintf(stderr, "P cbpy corrupted at %d %d\n", s->mb_x, s->mb_y); + return DECODING_ACDC_LOST; + } + + if(s->cbp_table[xy] & 8) { +//fprintf(stderr, "dquant\n"); + s->qscale += quant_tab[get_bits(&s->gb, 2)]; + if (s->qscale < 1) + s->qscale = 1; + else if (s->qscale > 31) + s->qscale = 31; + h263_dc_scale(s); + } + s->qscale_table[xy]= s->qscale; + + s->cbp_table[xy]&= 3; //remove dquant + s->cbp_table[xy]|= (cbpy^0xf)<<2; + } + } + } + s->mb_x= 0; + } + + + return 0; +} + +static int mpeg4_decode_partitioned_mb(MpegEncContext *s, + DCTELEM block[6][64]) +{ + int cbp, mb_type; + const int xy= s->mb_x + s->mb_y*s->mb_width; + + if(s->mb_x==s->resync_mb_x && s->mb_y==s->resync_mb_y){ //Note resync_mb_{x,y}==0 at the start + int i; + int block_index_backup[6]; + int qscale= s->qscale; + + for(i=0; i<6; i++) block_index_backup[i]= s->block_index[i]; + + s->decoding_error= ff_mpeg4_decode_partitions(s); + + for(i=0; i<6; i++) s->block_index[i]= block_index_backup[i]; + s->first_slice_line=1; + s->mb_x= s->resync_mb_x; + s->mb_y= s->resync_mb_y; + s->qscale= qscale; + h263_dc_scale(s); + + if(s->decoding_error==DECODING_DESYNC) return -1; + } + + mb_type= s->mb_type[xy]; + if(s->decoding_error) + cbp=0; + else + cbp = s->cbp_table[xy]; + + if(s->decoding_error!=DECODING_ACDC_LOST && s->qscale_table[xy] != s->qscale){ + s->qscale= s->qscale_table[xy]; + h263_dc_scale(s); + } + + if (s->pict_type == P_TYPE || s->pict_type==S_TYPE) { + int i; + for(i=0; i<4; i++){ + s->mv[0][i][0] = s->motion_val[ s->block_index[i] ][0]; + s->mv[0][i][1] = s->motion_val[ s->block_index[i] ][1]; + } + s->mb_intra = mb_type&MB_TYPE_INTRA; + + if (mb_type&MB_TYPE_SKIPED) { + /* skip mb */ + for(i=0;i<6;i++) + s->block_last_index[i] = -1; + s->mv_dir = MV_DIR_FORWARD; + s->mv_type = MV_TYPE_16X16; + if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE){ + s->mcsel=1; 
+ s->mb_skiped = 0; + }else{ + s->mcsel=0; + s->mb_skiped = 1; + } + return 0; + }else if(s->mb_intra && s->decoding_error!=DECODING_ACDC_LOST){ + s->ac_pred = s->pred_dir_table[xy]>>7; + + /* decode each block */ + for (i = 0; i < 6; i++) { + int ret= mpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1); + if(ret==DECODING_AC_LOST){ + fprintf(stderr, "texture corrupted at %d %d (trying to continue with mc/dc only)\n", s->mb_x, s->mb_y); + s->decoding_error=DECODING_AC_LOST; + cbp=0; + }else if(ret==DECODING_ACDC_LOST){ + fprintf(stderr, "dc corrupted at %d %d (trying to continue with mc only)\n", s->mb_x, s->mb_y); + s->decoding_error=DECODING_ACDC_LOST; + break; + } + } + }else if(!s->mb_intra){ +// s->mcsel= 0; //FIXME do we need to init that + + s->mv_dir = MV_DIR_FORWARD; + if (mb_type&MB_TYPE_INTER4V) { + s->mv_type = MV_TYPE_8X8; + } else { + s->mv_type = MV_TYPE_16X16; + } + if(s->decoding_error==0 && cbp){ + /* decode each block */ + for (i = 0; i < 6; i++) { + int ret= mpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1); + if(ret==DECODING_AC_LOST){ + fprintf(stderr, "texture corrupted at %d %d (trying to continue with mc/dc only)\n", s->mb_x, s->mb_y); + s->decoding_error=DECODING_AC_LOST; + break; + } + } + } + } + } else { /* I-Frame */ + int i; + s->mb_intra = 1; + s->ac_pred = s->pred_dir_table[xy]>>7; + + /* decode each block */ + for (i = 0; i < 6; i++) { + int ret= mpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1); + if(ret==DECODING_AC_LOST){ + fprintf(stderr, "texture corrupted at %d %d (trying to continue with dc only)\n", s->mb_x, s->mb_y); + s->decoding_error=DECODING_AC_LOST; + cbp=0; + }else if(ret==DECODING_ACDC_LOST){ + fprintf(stderr, "dc corrupted at %d %d\n", s->mb_x, s->mb_y); + return -1; + } + } + } return 0; } + int h263_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) { @@ -1485,27 +2373,17 @@ int h263_decode_mb(MpegEncContext *s, INT16 *mot_val; static INT8 quant_tab[4] = { -1, -2, 1, 2 }; - 
if(s->resync_marker){ - if( s->resync_x_pos == s->mb_x+1 - || s->resync_x_pos == s->mb_x){ - /* f*ck mpeg4 - this is here so we dont need to slowdown h263_pred_motion with it */ - if(s->resync_x_pos == s->mb_x+1 && s->mb_x==0){ - int xy= s->block_index[0] - s->block_wrap[0]; - s->motion_val[xy][0]= s->motion_val[xy+2][0]; - s->motion_val[xy][1]= s->motion_val[xy+2][1]; - } + if(s->mb_x==0) PRINT_MB_TYPE("\n") + if(s->resync_marker){ + if(s->resync_mb_x == s->mb_x && s->resync_mb_y+1 == s->mb_y){ s->first_slice_line=0; - s->resync_x_pos=0; // isnt needed but for cleanness sake ;) - } - - if(show_aligned_bits(&s->gb, 1, 16) == 0){ - if( mpeg4_resync(s) < 0 ) return -1; - } } + if(s->data_partitioning && s->pict_type!=B_TYPE) + return mpeg4_decode_partitioned_mb(s, block); + if (s->pict_type == P_TYPE || s->pict_type==S_TYPE) { if (get_bits1(&s->gb)) { /* skip mb */ @@ -1517,7 +2395,7 @@ int h263_decode_mb(MpegEncContext *s, if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE){ const int a= s->sprite_warping_accuracy; // int l = (1 << (s->f_code - 1)) * 32; - + PRINT_MB_TYPE("G"); s->mcsel=1; if(s->divx_version==500 && s->divx_build==413){ s->mv[0][0][0] = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample)); @@ -1533,6 +2411,7 @@ int h263_decode_mb(MpegEncContext *s, s->mb_skiped = 0; }else{ + PRINT_MB_TYPE("S"); s->mcsel=0; s->mv[0][0][0] = 0; s->mv[0][0][1] = 0; @@ -1568,6 +2447,7 @@ int h263_decode_mb(MpegEncContext *s, } s->mv_dir = MV_DIR_FORWARD; if ((cbpc & 16) == 0) { + PRINT_MB_TYPE("P"); /* 16x16 motion prediction */ s->mv_type = MV_TYPE_16X16; h263_pred_motion(s, 0, &pred_x, &pred_y); @@ -1615,6 +2495,7 @@ int h263_decode_mb(MpegEncContext *s, skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */ } else { + PRINT_MB_TYPE("4"); s->mv_type = MV_TYPE_8X8; for(i=0;i<4;i++) { mot_val = h263_pred_motion(s, i, &pred_x, &pred_y); @@ -1643,8 +2524,8 @@ int h263_decode_mb(MpegEncContext *s, int modb1; // first bit of modb int modb2; // second bit of modb int 
mb_type; - int time_pp; - int time_pb; + uint16_t time_pp; + uint16_t time_pb; int xy; s->mb_intra = 0; //B-frames never contain intra blocks @@ -1674,7 +2555,7 @@ int h263_decode_mb(MpegEncContext *s, //FIXME is this correct? /* s->last_mv[0][0][0]= s->last_mv[0][0][1]=0;*/ - s->mb_skiped = 1; + PRINT_MB_TYPE("s") return 0; } @@ -1702,14 +2583,14 @@ int h263_decode_mb(MpegEncContext *s, mx=my=0; //for case 4, we could put this to the mb_type=4 but than gcc compains about uninitalized mx/my switch(mb_type) { - case 0: + case 0: /* direct */ mx = h263_decode_motion(s, 0, 1); my = h263_decode_motion(s, 0, 1); - case 4: + case 4: /* direct with mx=my=0 */ s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT; xy= s->block_index[0]; - time_pp= s->last_non_b_time[0] - s->last_non_b_time[1]; - time_pb= s->time - s->last_non_b_time[1]; + time_pp= s->pp_time; + time_pb= time_pp - s->bp_time; //if(time_pp>3000 )printf("%d %d ", time_pp, time_pb); //FIXME 4MV //FIXME avoid divides @@ -1723,6 +2604,7 @@ int h263_decode_mb(MpegEncContext *s, s->mv[0][0][1] = s->mv[1][0][0] = s->mv[1][0][1] = 1000;*/ + PRINT_MB_TYPE("D"); break; case 1: s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD; @@ -1735,6 +2617,7 @@ int h263_decode_mb(MpegEncContext *s, my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code); s->last_mv[1][0][0]= s->mv[1][0][0] = mx; s->last_mv[1][0][1]= s->mv[1][0][1] = my; + PRINT_MB_TYPE("i"); break; case 2: s->mv_dir = MV_DIR_BACKWARD; @@ -1742,6 +2625,7 @@ int h263_decode_mb(MpegEncContext *s, my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code); s->last_mv[1][0][0]= s->mv[1][0][0] = mx; s->last_mv[1][0][1]= s->mv[1][0][1] = my; + PRINT_MB_TYPE("B"); break; case 3: s->mv_dir = MV_DIR_FORWARD; @@ -1749,8 +2633,11 @@ int h263_decode_mb(MpegEncContext *s, my = h263_decode_motion(s, s->last_mv[0][0][1], s->f_code); s->last_mv[0][0][0]= s->mv[0][0][0] = mx; s->last_mv[0][0][1]= s->mv[0][0][1] = my; + PRINT_MB_TYPE("F"); break; - default: return -1; + default: + 
printf("illegal MB_type\n"); + return -1; } } else { /* I-Frame */ cbpc = get_vlc(&s->gb, &intra_MCBPC_vlc); @@ -1759,6 +2646,7 @@ int h263_decode_mb(MpegEncContext *s, dquant = cbpc & 4; s->mb_intra = 1; intra: + PRINT_MB_TYPE("I"); s->ac_pred = 0; if (s->h263_pred || s->h263_aic) { s->ac_pred = get_bits1(&s->gb); @@ -1770,6 +2658,7 @@ intra: s->c_dc_scale = 2 * s->qscale; } cbpy = get_vlc(&s->gb, &cbpy_vlc); + if(cbpy<0) return -1; cbp = (cbpc & 3) | (cbpy << 2); if (dquant) { s->qscale += quant_tab[get_bits(&s->gb, 2)]; @@ -1815,7 +2704,7 @@ static int h263_decode_motion(MpegEncContext * s, int pred, int f_code) if (sign) val = -val; val += pred; - + /* modulo decoding */ if (!s->h263_long_vectors) { l = (1 << (f_code - 1)) * 32; @@ -1951,7 +2840,7 @@ not_coded: return 0; } -static int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr) +static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr) { int level, pred, code; UINT16 *dc_val; @@ -1960,16 +2849,22 @@ static int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr) code = get_vlc(&s->gb, &dc_lum); else code = get_vlc(&s->gb, &dc_chrom); - if (code < 0) + if (code < 0 || code > 9 /* && s->nbit<9 */){ + fprintf(stderr, "illegal dc vlc\n"); return -1; + } if (code == 0) { level = 0; } else { level = get_bits(&s->gb, code); if ((level >> (code - 1)) == 0) /* if MSB not set it is negative*/ level = - (level ^ ((1 << code) - 1)); - if (code > 8) - skip_bits1(&s->gb); /* marker */ + if (code > 8){ + if(get_bits1(&s->gb)==0){ /* marker */ + fprintf(stderr, "dc marker bit missing\n"); + return -1; + } + } } pred = mpeg4_pred_dc(s, n, &dc_val, dir_ptr); @@ -1984,7 +2879,13 @@ static int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr) return level; } -static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, +/** + * decode a block + * returns 0 if everything went ok + * returns DECODING_AC_LOST if an error was detected during AC decoding + * returns DECODING_ACDC_LOST if 
an error was detected during DC decoding + */ +static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, int n, int coded) { int code, level, i, j, last, run; @@ -1994,11 +2895,18 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, if (s->mb_intra) { /* DC coef */ - level = mpeg4_decode_dc(s, n, &dc_pred_dir); - if (level < 0) - return -1; + if(s->data_partitioning && s->pict_type!=B_TYPE){ + level = s->dc_val[0][ s->block_index[n] ]; + if(n<4) level= (level + (s->y_dc_scale>>1))/s->y_dc_scale; //FIXME optimizs + else level= (level + (s->c_dc_scale>>1))/s->c_dc_scale; + dc_pred_dir= (s->pred_dir_table[s->mb_x + s->mb_y*s->mb_width]<<n)&32; + }else{ + level = mpeg4_decode_dc(s, n, &dc_pred_dir); + if (level < 0) + return DECODING_ACDC_LOST; + } block[0] = level; - i = 1; + i = 1; if (!coded) goto not_coded; rl = &rl_intra; @@ -2023,7 +2931,7 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, for(;;) { code = get_vlc(&s->gb, &rl->vlc); if (code < 0) - return -1; + return DECODING_AC_LOST; if (code == rl->n) { /* escape */ if (get_bits1(&s->gb) != 0) { @@ -2031,15 +2939,46 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, /* third escape */ last = get_bits1(&s->gb); run = get_bits(&s->gb, 6); - get_bits1(&s->gb); /* marker */ + if(get_bits1(&s->gb)==0){ + fprintf(stderr, "1. marker bit missing in 3. esc\n"); + return DECODING_AC_LOST; + } level = get_bits(&s->gb, 12); level = (level << 20) >> 20; /* sign extend */ - skip_bits1(&s->gb); /* marker */ + if(get_bits1(&s->gb)==0){ + fprintf(stderr, "2. marker bit missing in 3. esc\n"); + return DECODING_AC_LOST; + } + if(level>512 || level<-512){ //FIXME check that QP=1 is ok with this too + fprintf(stderr, "|level| overflow in 3. 
esc\n"); + return DECODING_AC_LOST; + } +#if 1 + { + const int abs_level= ABS(level); + int run1; + if(abs_level<=MAX_LEVEL && run<=MAX_RUN && s->error_resilience>=0){ + if(abs_level <= rl->max_level[last][run]){ + fprintf(stderr, "illegal 3. esc, vlc encoding possible\n"); + return DECODING_AC_LOST; + } + if(abs_level <= rl->max_level[last][run]*2){ + fprintf(stderr, "illegal 3. esc, esc 1 encoding possible\n"); + return DECODING_AC_LOST; + } + run1 = run - rl->max_run[last][abs_level] - 1; + if(run1 >= 0 && abs_level <= rl->max_level[last][run1]){ + fprintf(stderr, "illegal 3. esc, esc 2 encoding possible\n"); + return DECODING_AC_LOST; + } + } + } +#endif } else { /* second escape */ code = get_vlc(&s->gb, &rl->vlc); if (code < 0 || code >= rl->n) - return -1; + return DECODING_AC_LOST; run = rl->table_run[code]; level = rl->table_level[code]; last = code >= rl->last; @@ -2051,7 +2990,7 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, /* first escape */ code = get_vlc(&s->gb, &rl->vlc); if (code < 0 || code >= rl->n) - return -1; + return DECODING_AC_LOST; run = rl->table_run[code]; level = rl->table_level[code]; last = code >= rl->last; @@ -2068,7 +3007,7 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, } i += run; if (i >= 64) - return -1; + return DECODING_AC_LOST; j = scan_table[i]; block[j] = level; i++; @@ -2091,15 +3030,24 @@ int h263_decode_picture_header(MpegEncContext *s) { int format, width, height; - /* picture header */ - if (get_bits(&s->gb, 22) != 0x20) + /* picture start code */ + if (get_bits(&s->gb, 22) != 0x20) { + fprintf(stderr, "Bad picture start code\n"); return -1; + } + /* temporal reference */ s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */ - - if (get_bits1(&s->gb) != 1) - return -1; /* marker */ - if (get_bits1(&s->gb) != 0) + + /* PTYPE starts here */ + if (get_bits1(&s->gb) != 1) { + /* marker */ + fprintf(stderr, "Bad marker\n"); + return -1; + } + if (get_bits1(&s->gb) != 
0) { + fprintf(stderr, "Bad H263 id\n"); return -1; /* h263 id */ + } skip_bits1(&s->gb); /* split screen off */ skip_bits1(&s->gb); /* camera off */ skip_bits1(&s->gb); /* freeze picture release off */ @@ -2108,6 +3056,12 @@ int h263_decode_picture_header(MpegEncContext *s) s->gob_number = 0; format = get_bits(&s->gb, 3); + /* + 0 forbidden + 1 sub-QCIF + 10 QCIF + 7 extended PTYPE (PLUSPTYPE) + */ if (format != 7 && format != 6) { s->h263_plus = 0; @@ -2124,15 +3078,18 @@ int h263_decode_picture_header(MpegEncContext *s) s->unrestricted_mv = get_bits1(&s->gb); s->h263_long_vectors = s->unrestricted_mv; - if (get_bits1(&s->gb) != 0) + if (get_bits1(&s->gb) != 0) { + fprintf(stderr, "H263 SAC not supported\n"); return -1; /* SAC: off */ + } if (get_bits1(&s->gb) != 0) { s->mv_type = MV_TYPE_8X8; /* Advanced prediction mode */ } - if (get_bits1(&s->gb) != 0) + if (get_bits1(&s->gb) != 0) { + fprintf(stderr, "H263 PB frame not supported\n"); return -1; /* not PB frame */ - + } s->qscale = get_bits(&s->gb, 5); skip_bits1(&s->gb); /* Continuous Presence Multipoint mode: off */ } else { @@ -2141,10 +3098,12 @@ int h263_decode_picture_header(MpegEncContext *s) /* H.263v2 */ s->h263_plus = 1; ufep = get_bits(&s->gb, 3); /* Update Full Extended PTYPE */ - + + /* ufep other than 0 and 1 are reserved */ if (ufep == 1) { /* OPPTYPE */ format = get_bits(&s->gb, 3); + dprintf("ufep=1, format: %d\n", format); skip_bits(&s->gb,1); /* Custom PCF */ s->umvplus_dec = get_bits(&s->gb, 1); /* Unrestricted Motion Vector */ skip_bits1(&s->gb); /* Syntax-based Arithmetic Coding (SAC) */ @@ -2154,34 +3113,59 @@ int h263_decode_picture_header(MpegEncContext *s) if (get_bits1(&s->gb) != 0) { /* Advanced Intra Coding (AIC) */ s->h263_aic = 1; } + skip_bits(&s->gb, 7); + /* these are the 7 bits: (in order of appearence */ + /* Deblocking Filter */ + /* Slice Structured */ + /* Reference Picture Selection */ + /* Independent Segment Decoding */ + /* Alternative Inter VLC */ + /* Modified 
Quantization */ + /* Prevent start code emulation */ + skip_bits(&s->gb, 3); /* Reserved */ - } else if (ufep != 0) + } else if (ufep != 0) { + fprintf(stderr, "Bad UFEP type (%d)\n", ufep); return -1; + } /* MPPTYPE */ - s->pict_type = get_bits(&s->gb, 3) + 1; + s->pict_type = get_bits(&s->gb, 3) + I_TYPE; + dprintf("pict_type: %d\n", s->pict_type); if (s->pict_type != I_TYPE && s->pict_type != P_TYPE) return -1; skip_bits(&s->gb, 2); s->no_rounding = get_bits1(&s->gb); - //fprintf(stderr, "\nRTYPE: %d", s->no_rounding); + dprintf("RTYPE: %d\n", s->no_rounding); skip_bits(&s->gb, 4); /* Get the picture dimensions */ if (ufep) { if (format == 6) { /* Custom Picture Format (CPFMT) */ - skip_bits(&s->gb, 4); /* aspect ratio */ + s->aspect_ratio_info = get_bits(&s->gb, 4); + dprintf("aspect: %d\n", s->aspect_ratio_info); + /* aspect ratios: + 0 - forbidden + 1 - 1:1 + 2 - 12:11 (CIF 4:3) + 3 - 10:11 (525-type 4:3) + 4 - 16:11 (CIF 16:9) + 5 - 40:33 (525-type 16:9) + 6-14 - reserved + */ width = (get_bits(&s->gb, 9) + 1) * 4; skip_bits1(&s->gb); height = get_bits(&s->gb, 9) * 4; -#ifdef DEBUG - fprintf(stderr,"\nH.263+ Custom picture: %dx%d\n",width,height); -#endif - } - else { + dprintf("\nH.263+ Custom picture: %dx%d\n",width,height); + if (s->aspect_ratio_info == EXTENDED_PAR) { + /* aspected dimensions */ + skip_bits(&s->gb, 8); /* width */ + skip_bits(&s->gb, 8); /* height */ + } + } else { width = h263_format[format][0]; height = h263_format[format][1]; } @@ -2210,7 +3194,7 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s) int a= 2<<s->sprite_warping_accuracy; int rho= 3-s->sprite_warping_accuracy; int r=16/a; - int vop_ref[4][2]= {{0,0}, {s->width,0}, {0, s->height}, {s->width, s->height}}; // only true for rectangle shapes + const int vop_ref[4][2]= {{0,0}, {s->width,0}, {0, s->height}, {s->width, s->height}}; // only true for rectangle shapes int d[4][2]={{0,0}, {0,0}, {0,0}, {0,0}}; int sprite_ref[4][2]; int virtual_ref[2][2]; @@ -2276,13 
+3260,13 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s) // the idea behind this virtual_ref mess is to be able to use shifts later per pixel instead of divides // so the distance between points is converted from w&h based to w2&h2 based which are of the 2^x form virtual_ref[0][0]= 16*(vop_ref[0][0] + w2) - + RDIV(((w - w2)*(r*sprite_ref[0][0] - 16*vop_ref[0][0]) + w2*(r*sprite_ref[1][0] - 16*vop_ref[1][0])),w); + + ROUNDED_DIV(((w - w2)*(r*sprite_ref[0][0] - 16*vop_ref[0][0]) + w2*(r*sprite_ref[1][0] - 16*vop_ref[1][0])),w); virtual_ref[0][1]= 16*vop_ref[0][1] - + RDIV(((w - w2)*(r*sprite_ref[0][1] - 16*vop_ref[0][1]) + w2*(r*sprite_ref[1][1] - 16*vop_ref[1][1])),w); + + ROUNDED_DIV(((w - w2)*(r*sprite_ref[0][1] - 16*vop_ref[0][1]) + w2*(r*sprite_ref[1][1] - 16*vop_ref[1][1])),w); virtual_ref[1][0]= 16*vop_ref[0][0] - + RDIV(((h - h2)*(r*sprite_ref[0][0] - 16*vop_ref[0][0]) + h2*(r*sprite_ref[2][0] - 16*vop_ref[2][0])),h); + + ROUNDED_DIV(((h - h2)*(r*sprite_ref[0][0] - 16*vop_ref[0][0]) + h2*(r*sprite_ref[2][0] - 16*vop_ref[2][0])),h); virtual_ref[1][1]= 16*(vop_ref[0][1] + h2) - + RDIV(((h - h2)*(r*sprite_ref[0][1] - 16*vop_ref[0][1]) + h2*(r*sprite_ref[2][1] - 16*vop_ref[2][1])),h); + + ROUNDED_DIV(((h - h2)*(r*sprite_ref[0][1] - 16*vop_ref[0][1]) + h2*(r*sprite_ref[2][1] - 16*vop_ref[2][1])),h); switch(s->num_sprite_warping_points) { @@ -2398,6 +3382,7 @@ printf("%d %d\n", s->sprite_delta[1][1][1], a<<s->sprite_shift[1][1]);*/ int mpeg4_decode_picture_header(MpegEncContext * s) { int time_incr, startcode, state, v; + int time_increment; redo: /* search next start code */ @@ -2412,8 +3397,13 @@ int mpeg4_decode_picture_header(MpegEncContext * s) } state = ((state << 8) | v) & 0xffffff; if( get_bits_count(&s->gb) > s->gb.size*8-32){ - printf("no VOP startcode found\n"); - return -1; + if(s->gb.size>50){ + printf("no VOP startcode found, frame size was=%d\n", s->gb.size); + return -1; + }else{ + printf("frame skip\n"); + return FRAME_SKIPED; + } 
} } //printf("startcode %X %d\n", startcode, get_bits_count(&s->gb)); @@ -2422,24 +3412,34 @@ int mpeg4_decode_picture_header(MpegEncContext * s) /* vol header */ skip_bits(&s->gb, 1); /* random access */ - skip_bits(&s->gb, 8); /* vo_type */ + s->vo_type= get_bits(&s->gb, 8); if (get_bits1(&s->gb) != 0) { /* is_ol_id */ vo_ver_id = get_bits(&s->gb, 4); /* vo_ver_id */ skip_bits(&s->gb, 3); /* vo_priority */ } else { vo_ver_id = 1; } - +//printf("vo type:%d\n",s->vo_type); s->aspect_ratio_info= get_bits(&s->gb, 4); - if(s->aspect_ratio_info == EXTENDET_PAR){ + if(s->aspect_ratio_info == EXTENDED_PAR){ skip_bits(&s->gb, 8); //par_width skip_bits(&s->gb, 8); // par_height } - if(get_bits1(&s->gb)){ /* vol control parameter */ - printf("vol control parameter not supported\n"); - return -1; + if ((s->vol_control_parameters=get_bits1(&s->gb))) { /* vol control parameter */ + int chroma_format= get_bits(&s->gb, 2); + if(chroma_format!=1){ + printf("illegal chroma format\n"); + } + s->low_delay= get_bits1(&s->gb); + if(get_bits1(&s->gb)){ /* vbv parameters */ + printf("vbv parameters not supported\n"); + return -1; + } + }else{ + s->low_delay=0; } + s->shape = get_bits(&s->gb, 2); /* vol shape */ if(s->shape != RECT_SHAPE) printf("only rectangular vol supported\n"); if(s->shape == GRAY_SHAPE && vo_ver_id != 1){ @@ -2469,12 +3469,12 @@ int mpeg4_decode_picture_header(MpegEncContext * s) if(width && height){ /* they should be non zero but who knows ... 
*/ s->width = width; s->height = height; -// printf("%d %d\n", width, height); +// printf("width/height: %d %d\n", width, height); } } if(get_bits1(&s->gb)) printf("interlaced not supported\n"); /* interlaced */ - if(!get_bits1(&s->gb)) printf("OBMC not supported\n"); /* OBMC Disable */ + if(!get_bits1(&s->gb)) printf("OBMC not supported (very likely buggy encoder)\n"); /* OBMC Disable */ if (vo_ver_id == 1) { s->vol_sprite_usage = get_bits1(&s->gb); /* vol_sprite_usage */ } else { @@ -2509,7 +3509,57 @@ int mpeg4_decode_picture_header(MpegEncContext * s) } // FIXME a bunch of grayscale shape things - if(get_bits1(&s->gb)) printf("Quant-Type not supported\n"); /* vol_quant_type */ //FIXME + + if(get_bits1(&s->gb)){ /* vol_quant_type */ + int i, j, v; + /* load default matrixes */ + for(i=0; i<64; i++){ + v= ff_mpeg4_default_intra_matrix[i]; + s->intra_matrix[i]= v; + s->chroma_intra_matrix[i]= v; + + v= ff_mpeg4_default_non_intra_matrix[i]; + s->inter_matrix[i]= v; + s->chroma_inter_matrix[i]= v; + } + + /* load custom intra matrix */ + if(get_bits1(&s->gb)){ + for(i=0; i<64; i++){ + v= get_bits(&s->gb, 8); + if(v==0) break; + + j= zigzag_direct[i]; + s->intra_matrix[j]= v; + s->chroma_intra_matrix[j]= v; + } + } + + /* load custom non intra matrix */ + if(get_bits1(&s->gb)){ + for(i=0; i<64; i++){ + v= get_bits(&s->gb, 8); + if(v==0) break; + + j= zigzag_direct[i]; + s->inter_matrix[j]= v; + s->chroma_inter_matrix[j]= v; + } + + /* replicate last value */ + for(; i<64; i++){ + j= zigzag_direct[i]; + s->inter_matrix[j]= v; + s->chroma_inter_matrix[j]= v; + } + } + + s->dct_unquantize= s->dct_unquantize_mpeg2; + + // FIXME a bunch of grayscale shape things + }else + s->dct_unquantize= s->dct_unquantize_h263; + if(vo_ver_id != 1) s->quarter_sample= get_bits1(&s->gb); else s->quarter_sample=0; @@ -2518,10 +3568,12 @@ int mpeg4_decode_picture_header(MpegEncContext * s) s->resync_marker= !get_bits1(&s->gb); /* resync_marker_disabled */ - s->data_partioning= 
get_bits1(&s->gb); - if(s->data_partioning){ - printf("data partitioning not supported\n"); - skip_bits1(&s->gb); // reversible vlc + s->data_partitioning= get_bits1(&s->gb); + if(s->data_partitioning){ + s->rvlc= get_bits1(&s->gb); + if(s->rvlc){ + printf("reversible vlc not supported\n"); + } } if(vo_ver_id != 1) { @@ -2561,16 +3613,20 @@ int mpeg4_decode_picture_header(MpegEncContext * s) } buf[255]=0; e=sscanf(buf, "DivX%dBuild%d", &ver, &build); + if(e!=2) + e=sscanf(buf, "DivX%db%d", &ver, &build); if(e==2){ s->divx_version= ver; s->divx_build= build; if(s->picture_number==0){ printf("This file was encoded with DivX%d Build%d\n", ver, build); - if(ver==500 && build==413){ //most likely all version are indeed totally buggy but i dunno for sure ... + if(ver==500 && build==413){ printf("WARNING: this version of DivX is not MPEG4 compatible, trying to workaround these bugs...\n"); +#if 0 }else{ printf("hmm, i havnt seen that version of divx yet, lets assume they fixed these bugs ...\n" "using mpeg4 decoder, if it fails contact the developers (of ffmpeg)\n"); +#endif } } } @@ -2580,21 +3636,34 @@ int mpeg4_decode_picture_header(MpegEncContext * s) goto redo; } - s->pict_type = get_bits(&s->gb, 2) + 1; /* pict type: I = 0 , P = 1 */ -//printf("pic: %d, qpel:%d\n", s->pict_type, s->quarter_sample); + s->pict_type = get_bits(&s->gb, 2) + I_TYPE; /* pict type: I = 0 , P = 1 */ +//if(s->pict_type!=I_TYPE) return FRAME_SKIPED; + if(s->pict_type==B_TYPE && s->low_delay && s->vol_control_parameters==0){ + printf("low_delay flag set, but shouldnt, clearing it\n"); + s->low_delay=0; + } +// printf("pic: %d, qpel:%d\n", s->pict_type, s->quarter_sample); +//printf("%d", s->pict_type); time_incr=0; while (get_bits1(&s->gb) != 0) time_incr++; check_marker(&s->gb, "before time_increment"); - s->time_increment= get_bits(&s->gb, s->time_increment_bits); + time_increment= get_bits(&s->gb, s->time_increment_bits); +//printf(" type:%d incr:%d increment:%d\n", s->pict_type, time_incr, 
time_increment); if(s->pict_type!=B_TYPE){ + s->last_time_base= s->time_base; s->time_base+= time_incr; - s->last_non_b_time[1]= s->last_non_b_time[0]; - s->last_non_b_time[0]= s->time_base*s->time_increment_resolution + s->time_increment; + s->time= s->time_base*s->time_increment_resolution + time_increment; + s->pp_time= s->time - s->last_non_b_time; + s->last_non_b_time= s->time; }else{ - s->time= (s->last_non_b_time[1]/s->time_increment_resolution + time_incr)*s->time_increment_resolution; - s->time+= s->time_increment; + s->time= (s->last_time_base + time_incr)*s->time_increment_resolution + time_increment; + s->bp_time= s->last_non_b_time - s->time; + if(s->pp_time <=s->bp_time){ +// printf("messed up order, seeking?, skiping current b frame\n"); + return FRAME_SKIPED; + } } if(check_marker(&s->gb, "before vop_coded")==0 && s->picture_number==0){ @@ -2674,14 +3743,23 @@ int mpeg4_decode_picture_header(MpegEncContext * s) s->b_code = get_bits(&s->gb, 3); //printf("b-code %d\n", s->b_code); } -//printf("quant:%d fcode:%d\n", s->qscale, s->f_code); +//printf("quant:%d fcode:%d bcode:%d type:%d\n", s->qscale, s->f_code, s->b_code, s->pict_type); if(!s->scalability){ if (s->shape!=RECT_SHAPE && s->pict_type!=I_TYPE) { skip_bits1(&s->gb); // vop shape coding type } } } + /* detect buggy encoders which dont set the low_delay flag (divx4/xvid/opendivx)*/ + // note we cannot detect divx5 without b-frames easyly (allthough its buggy too) + if(s->vo_type==0 && s->vol_control_parameters==0 && s->divx_version==0 && s->picture_number==0){ + printf("looks like this file was encoded with (divx4/(old)xvid/opendivx) -> forcing low_delay flag\n"); + s->low_delay=1; + } + s->picture_number++; // better than pic number==0 allways ;) +//printf("done\n"); + return 0; } @@ -2691,22 +3769,29 @@ int intel_h263_decode_picture_header(MpegEncContext *s) int format; /* picture header */ - if (get_bits(&s->gb, 22) != 0x20) + if (get_bits(&s->gb, 22) != 0x20) { + fprintf(stderr, "Bad 
picture start code\n"); return -1; - skip_bits(&s->gb, 8); /* picture timestamp */ + } + s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */ - if (get_bits1(&s->gb) != 1) + if (get_bits1(&s->gb) != 1) { + fprintf(stderr, "Bad marker\n"); return -1; /* marker */ - if (get_bits1(&s->gb) != 0) + } + if (get_bits1(&s->gb) != 0) { + fprintf(stderr, "Bad H263 id\n"); return -1; /* h263 id */ + } skip_bits1(&s->gb); /* split screen off */ skip_bits1(&s->gb); /* camera off */ skip_bits1(&s->gb); /* freeze picture release off */ format = get_bits(&s->gb, 3); - if (format != 7) + if (format != 7) { + fprintf(stderr, "Intel H263 free format not supported\n"); return -1; - + } s->h263_plus = 0; s->pict_type = I_TYPE + get_bits1(&s->gb); @@ -2714,12 +3799,18 @@ int intel_h263_decode_picture_header(MpegEncContext *s) s->unrestricted_mv = get_bits1(&s->gb); s->h263_long_vectors = s->unrestricted_mv; - if (get_bits1(&s->gb) != 0) + if (get_bits1(&s->gb) != 0) { + fprintf(stderr, "SAC not supported\n"); return -1; /* SAC: off */ - if (get_bits1(&s->gb) != 0) + } + if (get_bits1(&s->gb) != 0) { + fprintf(stderr, "Advanced Prediction Mode not supported\n"); return -1; /* advanced prediction mode: off */ - if (get_bits1(&s->gb) != 0) - return -1; /* not PB frame */ + } + if (get_bits1(&s->gb) != 0) { + fprintf(stderr, "PB frame mode no supported\n"); + return -1; /* PB frame mode */ + } /* skip unknown header garbage */ skip_bits(&s->gb, 41); diff --git a/src/libffmpeg/libavcodec/h263data.h b/src/libffmpeg/libavcodec/h263data.h index a129fd6bf..5a7b943ea 100644 --- a/src/libffmpeg/libavcodec/h263data.h +++ b/src/libffmpeg/libavcodec/h263data.h @@ -1,11 +1,11 @@ /* intra MCBPC, mb_type = (intra), then (intraq) */ -static const UINT8 intra_MCBPC_code[8] = { 1, 1, 2, 3, 1, 1, 2, 3 }; -static const UINT8 intra_MCBPC_bits[8] = { 1, 3, 3, 3, 4, 6, 6, 6 }; +const UINT8 intra_MCBPC_code[8] = { 1, 1, 2, 3, 1, 1, 2, 3 }; +const UINT8 intra_MCBPC_bits[8] = { 1, 3, 3, 3, 4, 6, 6, 6 
}; /* inter MCBPC, mb_type = (inter), (intra), (interq), (intraq), (inter4v) */ /* Changed the tables for interq and inter4v+q, following the standard ** Juanjo ** */ -static const UINT8 inter_MCBPC_code[25] = { +const UINT8 inter_MCBPC_code[25] = { 1, 3, 2, 5, 3, 4, 3, 3, 3, 7, 6, 5, @@ -14,7 +14,7 @@ static const UINT8 inter_MCBPC_code[25] = { 1, /* Stuffing */ 2, 12, 14, 15, }; -static const UINT8 inter_MCBPC_bits[25] = { +const UINT8 inter_MCBPC_bits[25] = { 1, 4, 4, 6, 5, 8, 8, 7, 3, 7, 7, 9, @@ -125,45 +125,73 @@ static RLTable rl_inter = { inter_level, }; -/* table used for Advanced INTRA Coding, just RUN and LEVEL change */ -const INT8 inter_level_aic[102] = { - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 3, 2, 1, 2, 2, 4, 5, - 6, 7, 3, 2, 3, 4, 5, 2, - 3, 4, 2, 3, 1, 2, 25, 1, - 2, 24, 8, 2, 7, 4, 6, 1, - 9, 23, 2, 3, 1, 10, 12, 11, - 18, 17, 16, 15, 14, 13, 20, 19, - 22, 21, 1, 1, 1, 1, 1, 1, - 1, 2, 1, 1, 1, 3, 1, 1, - 1, 1, 1, 1, 1, 4, 1, 1, - 1, 1, 2, 2, 6, 5, 2, 2, - 3, 7, 3, 4, 9, 8, 1, 1, - 1, 2, 2, 2, 3, 10, +const UINT16 intra_vlc_aic[103][2] = { +{ 0x2, 2 }, { 0x6, 3 }, { 0xe, 4 }, { 0xc, 5 }, +{ 0xd, 5 }, { 0x10, 6 }, { 0x11, 6 }, { 0x12, 6 }, +{ 0x16, 7 }, { 0x1b, 8 }, { 0x20, 9 }, { 0x21, 9 }, +{ 0x1a, 9 }, { 0x1b, 9 }, { 0x1c, 9 }, { 0x1d, 9 }, +{ 0x1e, 9 }, { 0x1f, 9 }, { 0x23, 11 }, { 0x22, 11 }, +{ 0x57, 12 }, { 0x56, 12 }, { 0x55, 12 }, { 0x54, 12 }, +{ 0x53, 12 }, { 0xf, 4 }, { 0x14, 6 }, { 0x14, 7 }, +{ 0x1e, 8 }, { 0xf, 10 }, { 0x21, 11 }, { 0x50, 12 }, +{ 0xb, 5 }, { 0x15, 7 }, { 0xe, 10 }, { 0x9, 10 }, +{ 0x15, 6 }, { 0x1d, 8 }, { 0xd, 10 }, { 0x51, 12 }, +{ 0x13, 6 }, { 0x23, 9 }, { 0x7, 11 }, { 0x17, 7 }, +{ 0x22, 9 }, { 0x52, 12 }, { 0x1c, 8 }, { 0xc, 10 }, +{ 0x1f, 8 }, { 0xb, 10 }, { 0x25, 9 }, { 0xa, 10 }, +{ 0x24, 9 }, { 0x6, 11 }, { 0x21, 10 }, { 0x20, 10 }, +{ 0x8, 10 }, { 0x20, 11 }, { 0x7, 4 }, { 0xc, 6 }, +{ 0x10, 7 }, { 0x13, 8 }, { 0x11, 9 }, { 0x12, 9 }, +{ 0x4, 10 }, { 0x27, 11 }, { 0x26, 11 }, { 0x5f, 12 }, +{ 0xf, 6 }, { 0x13, 9 
}, { 0x5, 10 }, { 0x25, 11 }, +{ 0xe, 6 }, { 0x14, 9 }, { 0x24, 11 }, { 0xd, 6 }, +{ 0x6, 10 }, { 0x5e, 12 }, { 0x11, 7 }, { 0x7, 10 }, +{ 0x13, 7 }, { 0x5d, 12 }, { 0x12, 7 }, { 0x5c, 12 }, +{ 0x14, 8 }, { 0x5b, 12 }, { 0x15, 8 }, { 0x1a, 8 }, +{ 0x19, 8 }, { 0x18, 8 }, { 0x17, 8 }, { 0x16, 8 }, +{ 0x19, 9 }, { 0x15, 9 }, { 0x16, 9 }, { 0x18, 9 }, +{ 0x17, 9 }, { 0x4, 11 }, { 0x5, 11 }, { 0x58, 12 }, +{ 0x59, 12 }, { 0x5a, 12 }, { 0x3, 7 }, }; -const INT8 inter_run_aic[102] = { - 0, 1, 3, 5, 7, 8, 9, 10, - 11, 4, 9, 13, 0, 1, 1, 1, - 1, 1, 0, 3, 2, 3, 0, 4, - 3, 0, 5, 5, 2, 6, 0, 4, - 7, 0, 0, 8, 0, 2, 0, 12, - 0, 0, 2, 1, 6, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 14, 20, 1, 19, 2, - 3, 0, 5, 6, 4, 0, 9, 10, - 11, 12, 13, 8, 7, 0, 17, 18, - 16, 15, 2, 1, 0, 0, 4, 3, - 1, 0, 2, 1, 0, 0, 21, 22, - 23, 7, 6, 5, 3, 0, +const INT8 intra_run_aic[102] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 5, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 11, +12, 13, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, + 2, 2, 2, 3, 3, 3, 4, 4, + 5, 5, 6, 6, 7, 7, 8, 9, +10, 11, 12, 13, 14, 15, 16, 17, +18, 19, 20, 21, 22, 23, +}; + +const INT8 intra_level_aic[102] = { + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, +17, 18, 19, 20, 21, 22, 23, 24, +25, 1, 2, 3, 4, 5, 6, 7, + 1, 2, 3, 4, 1, 2, 3, 4, + 1, 2, 3, 1, 2, 3, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 1, + 1, 1, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 1, 2, 3, 4, + 1, 2, 3, 1, 2, 3, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, }; static RLTable rl_intra_aic = { 102, 58, - inter_vlc, - inter_run_aic, - inter_level_aic, + intra_vlc_aic, + intra_run_aic, + intra_level_aic, }; static const UINT16 h263_format[8][2] = { @@ -174,4 +202,3 @@ static const UINT16 h263_format[8][2] = { { 704, 576 }, { 1408, 1152 }, }; - diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c index 
e909ac56e..3c90a1e47 100644 --- a/src/libffmpeg/libavcodec/h263dec.c +++ b/src/libffmpeg/libavcodec/h263dec.c @@ -1,53 +1,60 @@ /* * H263 decoder - * Copyright (c) 2001 Gerard Lantau. + * Copyright (c) 2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include "config.h" -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include "dsputil.h" #include "avcodec.h" +#include "dsputil.h" #include "mpegvideo.h" -#include "xine-utils/xineutils.h" //#define DEBUG +//#define PRINT_FRAME_TIME +#ifdef PRINT_FRAME_TIME +static inline long long rdtsc() +{ + long long l; + asm volatile( "rdtsc\n\t" + : "=A" (l) + ); +// printf("%d\n", int(l/1000)); + return l; +} +#endif static int h263_decode_init(AVCodecContext *avctx) { MpegEncContext *s = avctx->priv_data; - int i; s->avctx = avctx; s->out_format = FMT_H263; s->width = avctx->width; s->height = avctx->height; + s->workaround_bugs= avctx->workaround_bugs; /* select sub codec */ switch(avctx->codec->id) { case CODEC_ID_H263: s->gob_number = 0; - s->first_gob_line = 0; + s->first_slice_line = 0; break; case CODEC_ID_MPEG4: s->time_increment_bits = 4; /* default value for broken headers */ s->h263_pred = 1; - s->has_b_frames = 1; + s->has_b_frames = 1; //default, might be overriden in the vol header during header parsing break; case CODEC_ID_MSMPEG4V1: s->h263_msmpeg4 = 1; @@ -64,23 +71,25 @@ static int h263_decode_init(AVCodecContext *avctx) s->h263_pred = 1; s->msmpeg4_version=3; break; + case CODEC_ID_WMV1: + s->h263_msmpeg4 = 1; + s->h263_pred = 1; + s->msmpeg4_version=4; + break; case CODEC_ID_H263I: s->h263_intel = 1; break; default: return -1; } - + s->codec_id= avctx->codec->id; + avctx->mbskip_table= s->mbskip_table; + /* for h263, we allocate the images after having read the header */ if (avctx->codec->id != CODEC_ID_H263 && avctx->codec->id != CODEC_ID_MPEG4) if (MPV_common_init(s) < 0) return -1; - /* XXX: suppress this matrix init, only needed because using mpeg1 - dequantize in mmx case */ - for(i=0;i<64;i++) - 
s->non_intra_matrix[i] = default_non_intra_matrix[i]; - if (s->h263_msmpeg4) msmpeg4_decode_init_vlc(s); else @@ -104,25 +113,37 @@ static int h263_decode_frame(AVCodecContext *avctx, MpegEncContext *s = avctx->priv_data; int ret; AVPicture *pict = data; - +#ifdef PRINT_FRAME_TIME +uint64_t time= rdtsc(); +#endif #ifdef DEBUG printf("*****frame %d size=%d\n", avctx->frame_number, buf_size); printf("bytes=%x %x %x %x\n", buf[0], buf[1], buf[2], buf[3]); #endif - + + s->hurry_up= avctx->hurry_up; + s->error_resilience= avctx->error_resilience; + s->workaround_bugs= avctx->workaround_bugs; + s->flags= avctx->flags; + /* no supplementary picture */ if (buf_size == 0) { *data_size = 0; return 0; } - init_get_bits(&s->gb, buf, buf_size); + if(s->bitstream_buffer_size && buf_size<20){ //divx 5.01+ frame reorder + init_get_bits(&s->gb, s->bitstream_buffer, s->bitstream_buffer_size); + }else + init_get_bits(&s->gb, buf, buf_size); + s->bitstream_buffer_size=0; /* let's go :-) */ if (s->h263_msmpeg4) { ret = msmpeg4_decode_picture_header(s); } else if (s->h263_pred) { ret = mpeg4_decode_picture_header(s); + s->has_b_frames= !s->low_delay; } else if (s->h263_intel) { ret = intel_h263_decode_picture_header(s); } else { @@ -146,8 +167,21 @@ static int h263_decode_frame(AVCodecContext *avctx, return -1; } + if(ret==FRAME_SKIPED) return 0; + /* skip if the header was thrashed */ if (ret < 0) return -1; + /* skip b frames if we dont have reference frames */ + if(s->num_available_buffers<2 && s->pict_type==B_TYPE) return 0; + /* skip b frames if we are in a hurry */ + if(s->hurry_up && s->pict_type==B_TYPE) return 0; + + if(s->next_p_frame_damaged){ + if(s->pict_type==B_TYPE) + return 0; + else + s->next_p_frame_damaged=0; + } MPV_frame_start(s); @@ -155,6 +189,12 @@ static int h263_decode_frame(AVCodecContext *avctx, printf("qscale=%d\n", s->qscale); #endif + /* init resync/ error resilience specific variables */ + s->next_resync_qscale= s->qscale; + s->next_resync_gb= s->gb; + 
if(s->resync_marker) s->mb_num_left= 0; + else s->mb_num_left= s->mb_num; + /* decode each macroblock */ s->block_wrap[0]= s->block_wrap[1]= @@ -167,7 +207,13 @@ static int h263_decode_frame(AVCodecContext *avctx, /* FIXME: In the future H.263+ will have intra prediction */ /* and we are gonna need another way to detect MPEG4 */ if (s->mb_y && !s->h263_pred) { - s->first_gob_line = h263_decode_gob_header(s); + s->first_slice_line = h263_decode_gob_header(s); + } + + if(s->msmpeg4_version==1){ + s->last_dc[0]= + s->last_dc[1]= + s->last_dc[2]= 128; } s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1; @@ -186,35 +232,95 @@ static int h263_decode_frame(AVCodecContext *avctx, #ifdef DEBUG printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y); #endif + + if(s->resync_marker){ + if(s->mb_num_left<=0){ + /* except the first block */ + if(s->mb_x!=0 || s->mb_y!=0){ + /* did we miss the next resync marker without noticing an error yet */ + if(((get_bits_count(&s->gb)+8)&(~7)) != s->next_resync_pos && s->decoding_error==0){ + fprintf(stderr, "slice end missmatch x:%d y:%d %d %d\n", + s->mb_x, s->mb_y, get_bits_count(&s->gb), s->next_resync_pos); + ff_conceal_past_errors(s, 1); + } + } + s->qscale= s->next_resync_qscale; + s->gb= s->next_resync_gb; + s->resync_mb_x= s->mb_x; //we know that the marker is here cuz mb_num_left was the distance to it + s->resync_mb_y= s->mb_y; + s->first_slice_line=1; + + if(s->codec_id==CODEC_ID_MPEG4){ + ff_mpeg4_clean_buffers(s); + ff_mpeg4_resync(s); + } + } + + if( s->resync_mb_x==s->mb_x + && s->resync_mb_y==s->mb_y && s->decoding_error!=0){ + fprintf(stderr, "resynced at %d %d\n", s->mb_x, s->mb_y); + s->decoding_error= 0; + } + } + //fprintf(stderr,"\nFrame: %d\tMB: %d",avctx->frame_number, (s->mb_y * s->mb_width) + s->mb_x); /* DCT & quantize */ - if (s->h263_msmpeg4) { - msmpeg4_dc_scale(s); - } else if (s->h263_pred) { - h263_dc_scale(s); + if (s->h263_pred && !(s->msmpeg4_version==1 || s->msmpeg4_version==2)) { + /* old ffmpeg encoded 
msmpeg4v3 workaround */ + if(s->workaround_bugs==1 && s->msmpeg4_version==3) + ff_old_msmpeg4_dc_scale(s); + else + h263_dc_scale(s); } else { /* default quantization values */ s->y_dc_scale = 8; s->c_dc_scale = 8; } - clear_blocks(s->block[0]); + + if(s->decoding_error!=DECODING_DESYNC){ + int last_error= s->decoding_error; + clear_blocks(s->block[0]); - s->mv_dir = MV_DIR_FORWARD; - s->mv_type = MV_TYPE_16X16; - if (s->h263_msmpeg4) { - if (msmpeg4_decode_mb(s, s->block) < 0) { - fprintf(stderr,"\nError at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x); - return -1; - } - } else { - if (h263_decode_mb(s, s->block) < 0) { - fprintf(stderr,"\nError at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x); - return -1; + s->mv_dir = MV_DIR_FORWARD; + s->mv_type = MV_TYPE_16X16; + if (s->h263_msmpeg4) { + if (msmpeg4_decode_mb(s, s->block) < 0) { + fprintf(stderr,"Error at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x); + s->decoding_error=DECODING_DESYNC; + } + } else { + if (h263_decode_mb(s, s->block) < 0) { + fprintf(stderr,"Error at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x); + s->decoding_error=DECODING_DESYNC; + } + } + + if(s->decoding_error!=last_error){ + ff_conceal_past_errors(s, 0); } } + + /* conceal errors */ + if( s->decoding_error==DECODING_DESYNC + || (s->decoding_error==DECODING_ACDC_LOST && s->mb_intra)){ + s->mv_dir = MV_DIR_FORWARD; + s->mv_type = MV_TYPE_16X16; + s->mb_skiped=0; + s->mb_intra=0; + s->mv[0][0][0]=0; //FIXME this is not optimal + s->mv[0][0][1]=0; + clear_blocks(s->block[0]); + }else if(s->decoding_error && !s->mb_intra){ + clear_blocks(s->block[0]); + } + //FIXME remove AC for intra + MPV_decode_mb(s, s->block); + + s->mb_num_left--; } - if (avctx->draw_horiz_band) { + if ( avctx->draw_horiz_band + && (s->num_available_buffers>=1 || (!s->has_b_frames)) ) { UINT8 *src_ptr[3]; int y, h, offset; y = s->mb_y * 16; @@ -236,11 +342,84 @@ static int h263_decode_frame(AVCodecContext *avctx, } } - if (s->h263_msmpeg4 && s->pict_type==I_TYPE) + 
if (s->h263_msmpeg4 && s->msmpeg4_version<4 && s->pict_type==I_TYPE) if(msmpeg4_decode_ext_header(s, buf_size) < 0) return -1; + + /* divx 5.01+ bistream reorder stuff */ + if(s->codec_id==CODEC_ID_MPEG4 && s->bitstream_buffer_size==0){ + int current_pos= get_bits_count(&s->gb)>>3; + if( buf_size - current_pos > 5 + && buf_size - current_pos < BITSTREAM_BUFFER_SIZE){ + int i; + int startcode_found=0; + for(i=current_pos; i<buf_size; i++){ + if(buf[i]==0 && buf[i+1]==0 && buf[i+2]==1 && buf[i+3]==0xB6){ + startcode_found=1; + break; + } + } + if(startcode_found){ + memcpy(s->bitstream_buffer, buf + current_pos, buf_size - current_pos); + s->bitstream_buffer_size= buf_size - current_pos; + } + } + } + + if(s->bitstream_buffer_size==0 && s->error_resilience>0){ + int left= s->gb.size*8 - get_bits_count(&s->gb); + int max_extra=8; + + if(s->codec_id==CODEC_ID_MPEG4) max_extra+=32; + + if(left>max_extra){ + fprintf(stderr, "discarding %d junk bits at end, next would be %X\n", left, show_bits(&s->gb, 24)); + if(s->decoding_error==0) + ff_conceal_past_errors(s, 1); + } + if(left<0){ + fprintf(stderr, "overreading %d bits\n", -left); + if(s->decoding_error==0) + ff_conceal_past_errors(s, 1); + } + } + MPV_frame_end(s); - +#if 0 //dirty show MVs, we should export the MV tables and write a filter to show them +{ + int mb_y; + s->has_b_frames=1; + for(mb_y=0; mb_y<s->mb_height; mb_y++){ + int mb_x; + int y= mb_y*16 + 8; + for(mb_x=0; mb_x<s->mb_width; mb_x++){ + int x= mb_x*16 + 8; + uint8_t *ptr= s->last_picture[0]; + int xy= 1 + mb_x*2 + (mb_y*2 + 1)*(s->mb_width*2 + 2); + int mx= (s->motion_val[xy][0]>>1) + x; + int my= (s->motion_val[xy][1]>>1) + y; + int i; + int max; + + if(mx<0) mx=0; + if(my<0) my=0; + if(mx>=s->width) mx= s->width -1; + if(my>=s->height) my= s->height-1; + max= ABS(mx-x); + if(ABS(my-y) > max) max= ABS(my-y); + /* the ugliest linedrawing routine ... 
*/ + for(i=0; i<max; i++){ + int x1= x + (mx-x)*i/max; + int y1= y + (my-y)*i/max; + ptr[y1*s->linesize + x1]+=100; + } + ptr[y*s->linesize + x]+=100; + s->mbskip_table[mb_x + mb_y*s->mb_width]=0; + } + } + +} +#endif if(s->pict_type==B_TYPE || (!s->has_b_frames)){ pict->data[0] = s->current_picture[0]; pict->data[1] = s->current_picture[1]; @@ -260,7 +439,13 @@ static int h263_decode_frame(AVCodecContext *avctx, /* we substract 1 because it is added on utils.c */ avctx->frame_number = s->picture_number - 1; - *data_size = sizeof(AVPicture); + /* dont output the last pic after seeking + note we allready added +1 for the current pix in MPV_frame_end(s) */ + if(s->num_available_buffers>=2 || (!s->has_b_frames)) + *data_size = sizeof(AVPicture); +#ifdef PRINT_FRAME_TIME +printf("%Ld\n", rdtsc()-time); +#endif return buf_size; } @@ -324,6 +509,18 @@ AVCodec msmpeg4v3_decoder = { CODEC_CAP_DRAW_HORIZ_BAND, }; +AVCodec wmv1_decoder = { + "wmv1", + CODEC_TYPE_VIDEO, + CODEC_ID_WMV1, + sizeof(MpegEncContext), + h263_decode_init, + NULL, + h263_decode_end, + h263_decode_frame, + CODEC_CAP_DRAW_HORIZ_BAND, +}; + AVCodec h263i_decoder = { "h263i", CODEC_TYPE_VIDEO, diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index 2c71850ee..b8eaa5fbd 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -1,25 +1,24 @@ /* * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Gerard Lantau. + * Copyright (c) 2000, 2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
+ * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> */ -#include "xine-utils/xineutils.h" #include "../dsputil.h" #include "../simple_idct.h" @@ -45,38 +44,124 @@ int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); - /* external functions, from idct_mmx.c */ void ff_mmx_idct(DCTELEM *block); void ff_mmxext_idct(DCTELEM *block); /* pixel operations */ -static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL; -static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL; -//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; -//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; +static 
const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; +static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; +static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; #define JUMPALIGN() __asm __volatile (".balign 8"::) #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) +#define MOVQ_WONE(regd) \ + __asm __volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ + "psrlw $15, %%" #regd ::) + +#define MOVQ_BFE(regd) \ + __asm __volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ + "paddb %%" #regd ", %%" #regd " \n\t" ::) + #ifndef PIC -#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone)) +#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone)) #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) #else // for shared library it's better to use this way for accessing constants // pcmpeqd -> -1 -#define MOVQ_WONE(regd) \ +#define MOVQ_BONE(regd) \ __asm __volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd ::) + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ + "psrlw $15, %%" #regd " \n\t" \ + "packuswb %%" #regd ", %%" #regd " \n\t" ::) #define MOVQ_WTWO(regd) \ __asm __volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd " \n\t" \ - "psllw $1, %%" #regd ::) + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ + "psrlw $15, %%" #regd " \n\t" \ + "psllw $1, %%" #regd " \n\t"::) + #endif +// using regr as temporary and for the output result +// first argument is unmodifed and second is trashed +// regfe is supposed to contain 0xfefefefefefefefe +#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ + "movq " #rega ", " #regr " \n\t"\ + "pand " #regb ", " #regr " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pand " #regfe "," #regb " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "paddb " #regb ", " #regr " \n\t" + +#define 
PAVGB_MMX(rega, regb, regr, regfe) \ + "movq " #rega ", " #regr " \n\t"\ + "por " #regb ", " #regr " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pand " #regfe "," #regb " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psubb " #regb ", " #regr " \n\t" + +// mm6 is supposed to contain 0xfefefefefefefefe +#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ + "movq " #rega ", " #regr " \n\t"\ + "movq " #regc ", " #regp " \n\t"\ + "pand " #regb ", " #regr " \n\t"\ + "pand " #regd ", " #regp " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pxor " #regc ", " #regd " \n\t"\ + "pand %%mm6, " #regb " \n\t"\ + "pand %%mm6, " #regd " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psrlq $1, " #regd " \n\t"\ + "paddb " #regb ", " #regr " \n\t"\ + "paddb " #regd ", " #regp " \n\t" + +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ + "movq " #rega ", " #regr " \n\t"\ + "movq " #regc ", " #regp " \n\t"\ + "por " #regb ", " #regr " \n\t"\ + "por " #regd ", " #regp " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pxor " #regc ", " #regd " \n\t"\ + "pand %%mm6, " #regb " \n\t"\ + "pand %%mm6, " #regd " \n\t"\ + "psrlq $1, " #regd " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psubb " #regb ", " #regr " \n\t"\ + "psubb " #regd ", " #regp " \n\t" + +/***********************************/ +/* MMX no rounding */ +#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx +#define SET_RND MOVQ_WONE +#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) +#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) + +#include "dsputil_mmx_rnd.h" + +#undef DEF +#undef SET_RND +#undef PAVGBP +#undef PAVGB +/***********************************/ +/* MMX rounding */ + +#define DEF(x, y) x ## _ ## y ##_mmx +#define SET_RND MOVQ_WTWO +#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) +#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) + +#include "dsputil_mmx_rnd.h" + +#undef DEF +#undef SET_RND +#undef PAVGBP +#undef PAVGB + /***********************************/ /* 3Dnow specific */ @@ -92,7 
+177,7 @@ static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x000 /***********************************/ /* MMX2 specific */ -#define DEF(x) x ## _sse +#define DEF(x) x ## _mmx2 /* Introduced only in MMX2 set */ #define PAVGB "pavgb" @@ -107,34 +192,59 @@ static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x000 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) { - DCTELEM *p; - const UINT8 *pix; - int i; + asm volatile( + "movl $-128, %%eax \n\t" + "pxor %%mm7, %%mm7 \n\t" + ".balign 16 \n\t" + "1: \n\t" + "movq (%0), %%mm0 \n\t" + "movq (%0, %2), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "movq %%mm0, (%1, %%eax)\n\t" + "movq %%mm1, 8(%1, %%eax)\n\t" + "movq %%mm2, 16(%1, %%eax)\n\t" + "movq %%mm3, 24(%1, %%eax)\n\t" + "addl %3, %0 \n\t" + "addl $32, %%eax \n\t" + "js 1b \n\t" + : "+r" (pixels) + : "r" (block+64), "r" (line_size), "r" (line_size*2) + : "%eax" + ); +} - /* read the pixels */ - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - for(i=0;i<4;i++) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm2, 8%0\n\t" - "movq %%mm1, 16%0\n\t" - "movq %%mm3, 24%0\n\t" - :"=m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size*2; - p += 16; - } +static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) +{ + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "movl $-128, %%eax \n\t" + ".balign 16 \n\t" + "1: \n\t" + "movq (%0), %%mm0 \n\t" + "movq (%1), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, 
%%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "movq %%mm0, (%2, %%eax)\n\t" + "movq %%mm1, 8(%2, %%eax)\n\t" + "addl %3, %0 \n\t" + "addl %3, %1 \n\t" + "addl $16, %%eax \n\t" + "jnz 1b \n\t" + : "+r" (s1), "+r" (s2) + : "r" (block+64), "r" (stride) + : "%eax" + ); } static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) @@ -203,12 +313,12 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line pix = pixels; MOVQ_ZERO(mm7); i = 4; - while (i) { + do { __asm __volatile( - "movq %2, %%mm0\n\t" - "movq 8%2, %%mm1\n\t" - "movq 16%2, %%mm2\n\t" - "movq 24%2, %%mm3\n\t" + "movq (%2), %%mm0\n\t" + "movq 8(%2), %%mm1\n\t" + "movq 16(%2), %%mm2\n\t" + "movq 24(%2), %%mm3\n\t" "movq %0, %%mm4\n\t" "movq %1, %%mm6\n\t" "movq %%mm4, %%mm5\n\t" @@ -226,809 +336,42 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line "movq %%mm0, %0\n\t" "movq %%mm2, %1\n\t" :"+m"(*pix), "+m"(*(pix+line_size)) - :"m"(*p) + :"r"(p) :"memory"); pix += line_size*2; p += 16; - i--; - }; + } while (--i); } static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - int hh; - UINT8 *p; - const UINT8 *pix; - - p = block; - pix = pixels; // 2s -#if 0 - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -#else - // this optimized code is not very usefull - // the above loop is definitely faster - // at least on Celeron 500MHz - hh = h & 3; - while (hh) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - hh--; - } - hh=h>>2; - while (hh) { - __asm __volatile( - "movq (%1), %%mm0 \n\t" - "movq (%1, %2), %%mm1 \n\t" - "movq (%1, %2, 2), %%mm2 \n\t" - "movq (%1, %3), %%mm3 \n\t" - "movq %%mm0, (%0) 
\n\t" - "movq %%mm1, (%0, %2) \n\t" - "movq %%mm2, (%0, %2, 2) \n\t" - "movq %%mm3, (%0, %3) \n\t" - ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) - :"memory"); - pix += line_size*4; - p += line_size*4; - hh--; - } -#endif -} - -static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm4); - JUMPALIGN(); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq 1%1, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm4, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; p += line_size; - } while (--h); -} - -static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm4); - JUMPALIGN(); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm4, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -} - -static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; // 1s - MOVQ_ZERO(mm7); - MOVQ_WTWO(mm6); - 
JUMPALIGN(); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "psrlw $2, %%mm0\n\t" - "psrlw $2, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while(--h); -} - -static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq 1%1, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -} - -static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - JUMPALIGN(); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, 
%%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while(--h); -} - -static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); - JUMPALIGN(); - do { __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "psrlw $2, %%mm0\n\t" - "psrlw $2, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while(--h); -} - -static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); - JUMPALIGN(); - do { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, 
%%mm2\n\t" - "paddusw %%mm6, %%mm0\n\t" - "paddusw %%mm6, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } - while (--h); -} - -static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); - JUMPALIGN(); - do { - __asm __volatile( - "movq %1, %%mm1\n\t" - "movq %0, %%mm0\n\t" - "movq 1%1, %%mm4\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "paddusw %%mm4, %%mm1\n\t" - "paddusw %%mm5, %%mm3\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm6, %%mm3\n\t" - "psrlw $1, %%mm1\n\t" - "psrlw $1, %%mm3\n\t" - "paddusw %%mm6, %%mm0\n\t" - "paddusw %%mm6, %%mm2\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -} - -static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); - JUMPALIGN(); - do { - __asm __volatile( - "movq %1, %%mm1\n\t" - "movq %0, %%mm0\n\t" - "movq %2, %%mm4\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "paddusw %%mm4, %%mm1\n\t" - "paddusw %%mm5, %%mm3\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm6, %%mm3\n\t" - 
"psrlw $1, %%mm1\n\t" - "psrlw $1, %%mm3\n\t" - "paddusw %%mm6, %%mm0\n\t" - "paddusw %%mm6, %%mm2\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size ; - } while(--h); -} - -static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - // this doesn't seem to be used offten - so - // the inside usage of mm_wone is not optimized - MOVQ_WTWO(mm6); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "movq %3, %%mm5\n\t" - "psrlw $2, %%mm0\n\t" - "movq %0, %%mm1\n\t" - "psrlw $2, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "paddusw %%mm5, %%mm0\n\t" - "paddusw %%mm5, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)), "m"(mm_wone) - :"memory"); - pix += line_size; - p += line_size ; - } while(--h); -} - -static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, 
int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %0, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size ; - } while (--h); -} - -static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq 1%1, %%mm1\n\t" - "movq %0, %%mm4\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm5, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += line_size; - } while (--h); -} - -static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq %0, %%mm4\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, 
%%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm5, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size ; - } while(--h); -} - -static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) -{ - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); - JUMPALIGN(); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "movq %0, %%mm1\n\t" - "psrlw $2, %%mm0\n\t" - "movq %%mm1, %%mm3\n\t" - "psrlw $2, %%mm2\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "psrlw $1, %%mm0\n\t" - "psrlw $1, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size; - } while(--h); -} - -static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - do { - __asm __volatile( - "movq %0, 
%%mm0\n\t" - "movq %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += 8; - } while (--h); -} - -static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); - JUMPALIGN(); - do { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq %%mm2, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddusw %%mm4, %%mm2\n\t" - "paddusw %%mm5, %%mm3\n\t" - "paddusw %%mm6, %%mm2\n\t" - "paddusw %%mm6, %%mm3\n\t" - "psrlw $1, %%mm2\n\t" - "psrlw $1, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += 8; - } while (--h); -} - -static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WONE(mm6); - do { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq %2, %%mm4\n\t" - "movq %%mm2, %%mm3\n\t" - "movq %%mm4, %%mm5\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpckhbw %%mm7, %%mm5\n\t" - "paddusw %%mm4, %%mm2\n\t" - "paddusw %%mm5, %%mm3\n\t" - "paddusw %%mm6, %%mm2\n\t" - "paddusw %%mm6, %%mm3\n\t" - "psrlw $1, %%mm2\n\t" - "psrlw $1, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"+m"(*p) - :"m"(*pix), 
"m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += 8; - } while (--h); -} - -static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) -{ - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - MOVQ_WTWO(mm6); - JUMPALIGN(); - do { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, %%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "movq %0, %%mm1\n\t" - "movq 8%0, %%mm3\n\t" - "psrlw $2, %%mm0\n\t" - "psrlw $2, %%mm2\n\t" - "psubsw %%mm0, %%mm1\n\t" - "psubsw %%mm2, %%mm3\n\t" - "movq %%mm1, %0\n\t" - "movq %%mm3, 8%0\n\t" - :"+m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += 8 ; - } while(--h); + "lea (%3, %3), %%eax \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"(line_size) + : "%eax", "memory" + ); } static void clear_blocks_mmx(DCTELEM *blocks) { - asm volatile( + __asm __volatile( "pxor %%mm7, %%mm7 \n\t" "movl $-128*6, %%eax \n\t" "1: \n\t" @@ -1043,7 +386,9 @@ static void 
clear_blocks_mmx(DCTELEM *blocks) ); } +#if 0 static void just_return() { return; } +#endif void dsputil_init_mmx(void) { @@ -1065,10 +410,11 @@ void dsputil_init_mmx(void) if (mm_flags & MM_MMX) { get_pixels = get_pixels_mmx; + diff_pixels = diff_pixels_mmx; put_pixels_clamped = put_pixels_clamped_mmx; add_pixels_clamped = add_pixels_clamped_mmx; clear_blocks= clear_blocks_mmx; - + pix_abs16x16 = pix_abs16x16_mmx; pix_abs16x16_x2 = pix_abs16x16_x2_mmx; pix_abs16x16_y2 = pix_abs16x16_y2_mmx; @@ -1088,7 +434,7 @@ void dsputil_init_mmx(void) put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; - + avg_pixels_tab[0] = avg_pixels_mmx; avg_pixels_tab[1] = avg_pixels_x2_mmx; avg_pixels_tab[2] = avg_pixels_y2_mmx; @@ -1098,44 +444,37 @@ void dsputil_init_mmx(void) avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; - - sub_pixels_tab[0] = sub_pixels_mmx; - sub_pixels_tab[1] = sub_pixels_x2_mmx; - sub_pixels_tab[2] = sub_pixels_y2_mmx; - sub_pixels_tab[3] = sub_pixels_xy2_mmx; if (mm_flags & MM_MMXEXT) { pix_abs16x16 = pix_abs16x16_mmx2; pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; - + pix_abs8x8 = pix_abs8x8_mmx2; pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; - - put_pixels_tab[1] = put_pixels_x2_sse; - put_pixels_tab[2] = put_pixels_y2_sse; - - avg_pixels_tab[0] = avg_pixels_sse; - avg_pixels_tab[1] = avg_pixels_x2_sse; - avg_pixels_tab[2] = avg_pixels_y2_sse; - avg_pixels_tab[3] = avg_pixels_xy2_sse; - - sub_pixels_tab[1] = sub_pixels_x2_sse; - sub_pixels_tab[2] = sub_pixels_y2_sse; + + put_pixels_tab[1] = put_pixels_x2_mmx2; + put_pixels_tab[2] = put_pixels_y2_mmx2; + put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; 
+ put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; + + avg_pixels_tab[0] = avg_pixels_mmx2; + avg_pixels_tab[1] = avg_pixels_x2_mmx2; + avg_pixels_tab[2] = avg_pixels_y2_mmx2; + avg_pixels_tab[3] = avg_pixels_xy2_mmx2; } else if (mm_flags & MM_3DNOW) { put_pixels_tab[1] = put_pixels_x2_3dnow; put_pixels_tab[2] = put_pixels_y2_3dnow; - + put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; + put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; + avg_pixels_tab[0] = avg_pixels_3dnow; avg_pixels_tab[1] = avg_pixels_x2_3dnow; avg_pixels_tab[2] = avg_pixels_y2_3dnow; avg_pixels_tab[3] = avg_pixels_xy2_3dnow; - - sub_pixels_tab[1] = sub_pixels_x2_3dnow; - sub_pixels_tab[2] = sub_pixels_y2_3dnow; } /* idct */ @@ -1181,12 +520,25 @@ void dsputil_init_mmx(void) avg_no_rnd_pixels_tab[2] = just_return; avg_no_rnd_pixels_tab[3] = just_return; - sub_pixels_tab[0] = just_return; - sub_pixels_tab[1] = just_return; - sub_pixels_tab[2] = just_return; - sub_pixels_tab[3] = just_return; - //av_fdct = just_return; //ff_idct = just_return; #endif } + +/* remove any non bit exact operation (testing purpose). NOTE that + this function should be kept as small as possible because it is + always difficult to test automatically non bit exact cases. 
*/ +void dsputil_set_bit_exact_mmx(void) +{ + if (mm_flags & MM_MMX) { + if (mm_flags & MM_MMXEXT) { + put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; + put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; + avg_pixels_tab[3] = avg_pixels_xy2_mmx; + } else if (mm_flags & MM_3DNOW) { + put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; + put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; + avg_pixels_tab[3] = avg_pixels_xy2_mmx; + } + } +} diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h index 830fe9f3b..a16ccc88b 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h @@ -1,342 +1,296 @@ /* * DSP utils : average functions are compiled twice for 3dnow/mmx2 - * Copyright (c) 2000, 2001 Gerard Lantau. + * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2002 Michael Niedermayer * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> + * and improved by Zdenek Kabelac <kabi@users.sf.net> + */ + +/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm + clobber bug - now it will work with 2.95.2 and also with -fPIC */ - static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>2; - dh=h&3; - while(hh--) { __asm __volatile( - "movq (%1), %%mm0\n\t" - "movq 1(%1), %%mm1\n\t" - "movq (%1, %2), %%mm2\n\t" - "movq 1(%1, %2), %%mm3\n\t" - "movq (%1, %2, 2), %%mm4\n\t" - "movq 1(%1, %2, 2), %%mm5\n\t" - "movq (%1, %3), %%mm6\n\t" - "movq 1(%1, %3), %%mm7\n\t" - PAVGB" %%mm1, %%mm0\n\t" - PAVGB" %%mm3, %%mm2\n\t" - PAVGB" %%mm5, %%mm4\n\t" - PAVGB" %%mm7, %%mm6\n\t" - "movq %%mm0, (%0)\n\t" - "movq %%mm2, (%0, %2)\n\t" - "movq %%mm4, (%0, %2, 2)\n\t" - "movq %%mm6, (%0, %3)\n\t" - ::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3) - :"memory"); - pix += line_size*4; p += line_size*4; - } - while(dh--) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq 1%1, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; p += line_size; - } + "lea (%3, %3), %%eax \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 
\n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + "addl %%eax, %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" (line_size) + :"%eax", "memory"); } - -static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) + +/* GL: this function does incorrect rounding if overflow */ +static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - - hh=h>>1; - dh=h&1; - while(hh--) { - __asm __volatile( - "movq %2, %%mm0\n\t" - "movq %3, %%mm1\n\t" - "movq %4, %%mm2\n\t" - PAVGB" %%mm1, %%mm0\n\t" - PAVGB" %%mm2, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, %1\n\t" - :"=m"(*p), "=m"(*(p+line_size)) - :"m"(*pix), "m"(*(pix+line_size)), - "m"(*(pix+line_size*2)) - :"memory"); - pix += line_size*2; - p += line_size*2; - } - if(dh) { + MOVQ_BONE(mm6); __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"=m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - } + "lea (%3, %3), %%eax \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + "addl %%eax, %1 \n\t" + "psubusb %%mm6, %%mm0 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + "addl %%eax, %2 \n\t" + "addl %%eax, %1 \n\t" + "psubusb %%mm6, %%mm0 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + 
"jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" (line_size) + :"%eax", "memory"); } -static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>2; - dh=h&3; - while(hh--) { __asm __volatile( - "movq (%0), %%mm0\n\t" - "movq (%1), %%mm1\n\t" - "movq (%0, %2), %%mm2\n\t" - "movq (%1, %2), %%mm3\n\t" - "movq (%0, %2, 2), %%mm4\n\t" - "movq (%1, %2, 2), %%mm5\n\t" - "movq (%0, %3), %%mm6\n\t" - "movq (%1, %3), %%mm7\n\t" - PAVGB" %%mm1, %%mm0\n\t" - PAVGB" %%mm3, %%mm2\n\t" - PAVGB" %%mm5, %%mm4\n\t" - PAVGB" %%mm7, %%mm6\n\t" - "movq %%mm0, (%0)\n\t" - "movq %%mm2, (%0, %2)\n\t" - "movq %%mm4, (%0, %2, 2)\n\t" - "movq %%mm6, (%0, %3)\n\t" - ::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3) - :"memory"); - pix += line_size*4; p += line_size*4; - } - while(dh--) { - __asm __volatile( - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; p += line_size; - } + "lea (%3, %3), %%eax \n\t" + "movq (%1), %%mm0 \n\t" + "subl %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax), %%mm2 \n\t" + "addl %%eax, %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%eax) \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax), %%mm0 \n\t" + "addl %%eax, %2 \n\t" + "addl %%eax, %1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%eax) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D" (block) + :"r" (line_size) + :"%eax", "memory"); } -static void DEF(avg_pixels_x2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) +/* GL: this function does incorrect rounding if overflow */ 
+static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>1; - dh=h&1; - while(hh--) { - __asm __volatile( - "movq %2, %%mm2\n\t" - "movq 1%2, %%mm3\n\t" - "movq %3, %%mm4\n\t" - "movq 1%3, %%mm5\n\t" - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - PAVGB" %%mm3, %%mm2\n\t" - PAVGB" %%mm2, %%mm0\n\t" - PAVGB" %%mm5, %%mm4\n\t" - PAVGB" %%mm4, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, %1\n\t" - :"+m"(*p), "+m"(*(p+line_size)) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size*2; - p += line_size*2; - } - if(dh) { + MOVQ_BONE(mm6); __asm __volatile( - "movq %1, %%mm1\n\t" - "movq 1%1, %%mm2\n\t" - "movq %0, %%mm0\n\t" - PAVGB" %%mm2, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - } + "lea (%3, %3), %%eax \n\t" + "movq (%1), %%mm0 \n\t" + "subl %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax), %%mm2 \n\t" + "addl %%eax, %1 \n\t" + "psubusb %%mm6, %%mm1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%eax) \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax), %%mm0 \n\t" + "addl %%eax, %2 \n\t" + "addl %%eax, %1 \n\t" + "psubusb %%mm6, %%mm1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%eax) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D" (block) + :"r" (line_size) + :"%eax", "memory"); } -static void DEF(avg_pixels_y2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - int dh, hh; - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - hh=h>>1; - dh=h&1; - while(hh--) { - __asm __volatile( - "movq %2, %%mm2\n\t" - "movq %3, %%mm3\n\t" - "movq %3, %%mm4\n\t" - 
"movq %4, %%mm5\n\t" - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - PAVGB" %%mm3, %%mm2\n\t" - PAVGB" %%mm2, %%mm0\n\t" - PAVGB" %%mm5, %%mm4\n\t" - PAVGB" %%mm4, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, %1\n\t" - :"+m"(*p), "+m"(*(p+line_size)) - :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)) - :"memory"); - pix += line_size*2; - p += line_size*2; - } - if(dh) { __asm __volatile( - "movq %1, %%mm1\n\t" - "movq %2, %%mm2\n\t" - "movq %0, %%mm0\n\t" - PAVGB" %%mm2, %%mm1\n\t" - PAVGB" %%mm1, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - } + "lea (%3, %3), %%eax \n\t" + "1: \n\t" + "movq (%2), %%mm0 \n\t" + "movq (%2, %3), %%mm1 \n\t" + PAVGB" (%1), %%mm0 \n\t" + PAVGB" (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "movq (%2), %%mm0 \n\t" + "movq (%2, %3), %%mm1 \n\t" + PAVGB" (%1), %%mm0 \n\t" + PAVGB" (%1, %3), %%mm1 \n\t" + "addl %%eax, %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" (line_size) + :"%eax", "memory"); } -static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - UINT8 *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7\n\t" - "movq %0, %%mm6\n\t" - ::"m"(mm_wtwo)); - do { __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %2, %%mm1\n\t" - "movq 1%1, %%mm4\n\t" - "movq 1%2, %%mm5\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm1, %%mm0\n\t" - "paddusw %%mm3, %%mm2\n\t" - "movq %%mm4, %%mm1\n\t" - "movq %%mm5, %%mm3\n\t" - "punpcklbw %%mm7, %%mm4\n\t" - "punpcklbw %%mm7, 
%%mm5\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "paddusw %%mm5, %%mm4\n\t" - "paddusw %%mm3, %%mm1\n\t" - "paddusw %%mm6, %%mm4\n\t" - "paddusw %%mm6, %%mm1\n\t" - "paddusw %%mm4, %%mm0\n\t" - "paddusw %%mm1, %%mm2\n\t" - "psrlw $2, %%mm0\n\t" - "psrlw $2, %%mm2\n\t" - "packuswb %%mm2, %%mm0\n\t" - PAVGB" %0, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :"+m"(*p) - :"m"(*pix), - "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += line_size ; - } while(--h); + "lea (%3, %3), %%eax \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm2 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" (%2, %3), %%mm2 \n\t" + "addl %%eax, %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm2 \n\t" + "addl %%eax, %2 \n\t" + "addl %%eax, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" (%2, %3), %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" (line_size) + :"%eax", "memory"); } -static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7":); - do { __asm __volatile( - "movq 1%1, %%mm2\n\t" - "movq %0, %%mm0\n\t" - PAVGB" %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"+m"(*p) - :"m"(*pix) - :"memory"); - pix += line_size; - p += 8; - } while (--h); + "lea (%3, %3), %%eax \n\t" + "movq (%1), %%mm0 \n\t" + "subl %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), 
%%mm1 \n\t" + "movq (%1, %%eax), %%mm2 \n\t" + "addl %%eax, %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq (%2, %3), %%mm3 \n\t" + "movq (%2, %%eax), %%mm4 \n\t" + PAVGB" %%mm3, %%mm0 \n\t" + PAVGB" %%mm4, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%eax) \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax), %%mm0 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "addl %%eax, %2 \n\t" + "addl %%eax, %1 \n\t" + "movq (%2, %3), %%mm3 \n\t" + "movq (%2, %%eax), %%mm4 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + PAVGB" %%mm4, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%eax) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" (line_size) + :"%eax", "memory"); } -static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) +// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter +static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { - DCTELEM *p; - const UINT8 *pix; - p = block; - pix = pixels; - __asm __volatile( - "pxor %%mm7, %%mm7":); - do { + MOVQ_BONE(mm6); __asm __volatile( - "movq %2, %%mm2\n\t" - "movq %0, %%mm0\n\t" - PAVGB" %1, %%mm2\n\t" - "movq 8%0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "psubsw %%mm2, %%mm0\n\t" - "psubsw %%mm3, %%mm1\n\t" - "movq %%mm0, %0\n\t" - "movq %%mm1, 8%0\n\t" - :"+m"(*p) - :"m"(*pix), "m"(*(pix+line_size)) - :"memory"); - pix += line_size; - p += 8; - } while (--h); + "lea (%3, %3), %%eax \n\t" + "movq (%1), %%mm0 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1, %%eax), %%mm2 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 1(%1, %%eax), %%mm2 \n\t" + "addl %%eax, %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + PAVGB" (%2), 
%%mm0 \n\t" + PAVGB" (%2, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 1(%1, %%eax), %%mm0 \n\t" + "addl %%eax, %2 \n\t" + "addl %%eax, %1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + PAVGB" (%2), %%mm2 \n\t" + PAVGB" (%2, %3), %%mm1 \n\t" + "movq %%mm2, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" (line_size) + :"%eax", "memory"); } - diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h new file mode 100644 index 000000000..dc70c9c8e --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h @@ -0,0 +1,305 @@ +/* + * DSP utils mmx functions are compiled twice for rnd/no_rnd + * Copyright (c) 2000, 2001 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> + * and improved by Zdenek Kabelac <kabi@users.sf.net> + */ + +// put_pixels +static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm __volatile( + "lea (%3, %3), %%eax \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"(line_size) + :"eax", "memory"); +} + +static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm __volatile( + "lea (%3, %3), %%eax \n\t" + "movq (%1), %%mm0 \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax),%%mm2 \n\t" + PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax),%%mm0 \n\t" + PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + 
:"r"(line_size) + :"eax", "memory"); +} + +static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version + __asm __volatile( + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "xorl %%eax, %%eax \n\t" + "addl %3, %1 \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1, %%eax), %%mm0 \n\t" + "movq 1(%1, %%eax), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddusw %%mm2, %%mm0 \n\t" + "paddusw %%mm3, %%mm1 \n\t" + "paddusw %%mm6, %%mm4 \n\t" + "paddusw %%mm6, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm5 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "movq %%mm4, (%2, %%eax) \n\t" + "addl %3, %%eax \n\t" + + "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%eax), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm3, %%mm5 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm6, %%mm1 \n\t" + "paddusw %%mm4, %%mm0 \n\t" + "paddusw %%mm5, %%mm1 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "movq %%mm0, (%2, %%eax) \n\t" + "addl %3, %%eax \n\t" + + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels) + :"D"(block), "r"(line_size) + :"eax", "memory"); +} + +// avg_pixels +// in case more speed is needed - unroling would certainly help +static void DEF(avg, pixels)(UINT8 
*block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm __volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +static void DEF(avg, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm __volatile( + "movq %1, %%mm0 \n\t" + "movq 1%1, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } while (--h); +} + +static void DEF(avg, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm __volatile( + "lea (%3, %3), %%eax \n\t" + "movq (%1), %%mm0 \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax), %%mm2 \n\t" + PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%eax), %%mm0 \n\t" + PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"(line_size) + :"eax", "memory"); +} + +// this routine is 'slightly' suboptimal but mostly unused +static void DEF(avg, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for 
rnd and =1 for no_rnd version + __asm __volatile( + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "xorl %%eax, %%eax \n\t" + "addl %3, %1 \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1, %%eax), %%mm0 \n\t" + "movq 1(%1, %%eax), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddusw %%mm2, %%mm0 \n\t" + "paddusw %%mm3, %%mm1 \n\t" + "paddusw %%mm6, %%mm4 \n\t" + "paddusw %%mm6, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm5 \n\t" + "movq (%2, %%eax), %%mm3 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) + "movq %%mm5, (%2, %%eax) \n\t" + "addl %3, %%eax \n\t" + + "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%eax), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm3, %%mm5 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm6, %%mm1 \n\t" + "paddusw %%mm4, %%mm0 \n\t" + "paddusw %%mm5, %%mm1 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "movq (%2, %%eax), %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) + "movq %%mm1, (%2, %%eax) \n\t" + "addl %3, %%eax \n\t" + + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels) + :"D"(block), "r"(line_size) + :"eax", "memory"); +} diff --git 
a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c index e9d48383d..7135beb21 100644 --- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c @@ -1,6 +1,6 @@ /* * MMX optimized forward DCT - * The gcc porting is Copyright (c) 2001 Gerard Lantau. + * The gcc porting is Copyright (c) 2001 Fabrice Bellard. * * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT * @@ -10,7 +10,7 @@ #include "../common.h" #include "mmx.h" -//#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) ////////////////////////////////////////////////////////////////////// // diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx.c b/src/libffmpeg/libavcodec/i386/idct_mmx.c index 618c1cfde..298c8a8b0 100644 --- a/src/libffmpeg/libavcodec/i386/idct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/idct_mmx.c @@ -528,8 +528,12 @@ static inline void idct_col (int16_t * col, int offset) movq_r2m (mm3, *(col+offset+4*8)); // save y4 movq_r2m (mm4, *(col+offset+3*8)); // save y3 -} +#undef T1 +#undef T2 +#undef T3 +#undef C4 +} static int32_t rounder0[] ATTR_ALIGN(8) = rounder ((1 << (COL_SHIFT - 1)) - 0.5); @@ -547,6 +551,8 @@ static int32_t rounder3[] ATTR_ALIGN(8) = static int32_t rounder5[] ATTR_ALIGN(8) = rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ +#undef COL_SHIFT +#undef ROW_SHIFT #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ void idct (int16_t * block) \ diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c index e704c4219..9b76cdb07 100644 --- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c +++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c @@ -1,20 +1,20 @@ /* * MMX optimized motion estimation - * Copyright (c) 2001 Gerard Lantau. + * Copyright (c) 2001 Fabrice Bellard. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * mostly by Michael Niedermayer <michaelni@gmx.at> */ @@ -26,6 +26,8 @@ static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={ 0x0002000200020002, }; +static __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL; + static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) { int len= -(stride<<h); @@ -115,6 +117,7 @@ static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) int len= -(stride<<h); asm volatile( ".balign 16 \n\t" + "movq "MANGLE(bone)", %%mm5 \n\t" "1: \n\t" "movq (%1, %%eax), %%mm0 \n\t" "movq (%2, %%eax), %%mm2 \n\t" @@ -122,6 +125,7 @@ static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) "movq 1(%2, %%eax), %%mm3 \n\t" "pavgb %%mm2, %%mm0 \n\t" "pavgb %%mm1, %%mm3 \n\t" + "psubusb %%mm5, %%mm3 \n\t" "pavgb %%mm3, %%mm0 \n\t" "movq (%3, %%eax), %%mm2 \n\t" "psadbw %%mm2, %%mm0 \n\t" @@ -132,6 +136,7 @@ static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) "movq 1(%2, %%eax), %%mm4 \n\t" "pavgb %%mm3, %%mm1 \n\t" "pavgb %%mm4, %%mm2 \n\t" + "psubusb %%mm5, %%mm2 \n\t" "pavgb %%mm1, %%mm2 \n\t" "movq (%3, %%eax), %%mm1 \n\t" "psadbw %%mm1, %%mm2 \n\t" diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c index b7a782f56..390aa554c 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c @@ -1,34 +1,30 @@ /* * The simplest mpeg encoder (well, it was the simplest!) - * Copyright (c) 2000,2001 Gerard Lantau. + * Copyright (c) 2000,2001 Fabrice Bellard. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru> - * h263 dequantizer by Michael Niedermayer <michaelni@gmx.at> + * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> */ -#include "xine-utils/xineutils.h" #include "../dsputil.h" #include "../mpegvideo.h" #include "../avcodec.h" -#include "../mangle.h" extern UINT8 zigzag_end[64]; -extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w); -extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale); extern UINT8 zigzag_direct_noperm[64]; extern UINT16 inv_zigzag_direct16[64]; @@ -195,103 +191,86 @@ asm volatile( static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { - int i, level, nCoeffs; + int nCoeffs; const UINT16 *quant_matrix; if(s->alternate_scan) nCoeffs= 64; else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ]; if (s->mb_intra) { + int block0; if (n < 4) - block[0] = block[0] * s->y_dc_scale; + block0 = block[0] * s->y_dc_scale; else - block[0] = block[0] * s->c_dc_scale; - /* isnt used anymore (we have a h263 unquantizer since some time) - if (s->out_format == FMT_H263) { - i = 1; - goto unquant_even; - }*/ + block0 = block[0] * s->c_dc_scale; /* XXX: only mpeg1 */ quant_matrix = s->intra_matrix; - i=1; - /* Align on 4 elements boundary */ - while(i&3) - { - level = block[i]; - if (level) { - if (level < 0) level = -level; - level = (int)(level * qscale * quant_matrix[i]) >> 3; - level = (level - 1) | 1; - if (block[i] < 0) level = -level; - block[i] = level; - } - i++; - } - __asm __volatile( - "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */ - "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */ - "movq %2, %%mm4\n\t" - "movq %%mm6, %%mm7\n\t" - "movq %1, 
%%mm5\n\t" - "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */ - "pxor %%mm6, %%mm6\n\t" - ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory"); - for(;i<nCoeffs;i+=4) { - __asm __volatile( - "movq %1, %%mm0\n\t" - "movq %%mm7, %%mm1\n\t" - "movq %%mm0, %%mm2\n\t" - "movq %%mm0, %%mm3\n\t" - "pcmpgtw %%mm6, %%mm2\n\t" - "pmullw %2, %%mm1\n\t" - "pandn %%mm4, %%mm2\n\t" - "por %%mm5, %%mm2\n\t" - "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */ - - "pcmpeqw %%mm6, %%mm3\n\t" - "pmullw %%mm0, %%mm1\n\t" - "psraw $3, %%mm1\n\t" - "psubw %%mm5, %%mm1\n\t" /* block[i] --; */ - "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */ - "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */ - "pmullw %%mm2, %%mm1\n\t" /* change signs again */ - - "pand %%mm3, %%mm1\n\t" /* nullify if was zero */ - "movq %%mm1, %0" - :"=m"(block[i]) - :"m"(block[i]), "m"(quant_matrix[i]) - :"memory"); - } - } else { - i = 0; -// unquant_even: - quant_matrix = s->non_intra_matrix; - /* Align on 4 elements boundary */ - while(i&7) - { - level = block[i]; - if (level) { - if (level < 0) level = -level; - level = (((level << 1) + 1) * qscale * - ((int) quant_matrix[i])) >> 4; - level = (level - 1) | 1; - if(block[i] < 0) level = -level; - block[i] = level; - } - i++; - } asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $15, %%mm7 \n\t" "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" + "movl %3, %%eax \n\t" ".balign 16\n\t" "1: \n\t" - "movq (%0, %3), %%mm0 \n\t" - "movq 8(%0, %3), %%mm1 \n\t" - "movq (%1, %3), %%mm4 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" + "movq (%0, %%eax), %%mm0 \n\t" + "movq 8(%0, %%eax), %%mm1 \n\t" + "movq (%1, %%eax), %%mm4 \n\t" + "movq 8(%1, %%eax), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? 
-1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "psraw $3, %%mm0 \n\t" + "psraw $3, %%mm1 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm1 \n\t" + "por %%mm7, %%mm0 \n\t" + "por %%mm7, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %%eax) \n\t" + "movq %%mm5, 8(%0, %%eax) \n\t" + + "addl $16, %%eax \n\t" + "js 1b \n\t" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) + : "%eax", "memory" + ); + block[0]= block0; + + } else { + quant_matrix = s->inter_matrix; +asm volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movl %3, %%eax \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %%eax), %%mm0 \n\t" + "movq 8(%0, %%eax), %%mm1 \n\t" + "movq (%1, %%eax), %%mm4 \n\t" + "movq 8(%1, %%eax), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" @@ -310,8 +289,8 @@ asm volatile( "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %3), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %3), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? 
-1 : 0 "psraw $4, %%mm0 \n\t" "psraw $4, %%mm1 \n\t" "psubw %%mm7, %%mm0 \n\t" @@ -324,13 +303,145 @@ asm volatile( "psubw %%mm3, %%mm1 \n\t" "pandn %%mm0, %%mm4 \n\t" "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %3) \n\t" - "movq %%mm5, 8(%0, %3) \n\t" + "movq %%mm4, (%0, %%eax) \n\t" + "movq %%mm5, 8(%0, %%eax) \n\t" - "addl $16, %3 \n\t" + "addl $16, %%eax \n\t" "js 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (2*(i-nCoeffs)) - : "memory" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) + : "%eax", "memory" + ); + } +} + +static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int nCoeffs; + const UINT16 *quant_matrix; + + if(s->alternate_scan) nCoeffs= 64; + else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ]; + + if (s->mb_intra) { + int block0; + if (n < 4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + quant_matrix = s->intra_matrix; +asm volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movl %3, %%eax \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %%eax), %%mm0 \n\t" + "movq 8(%0, %%eax), %%mm1 \n\t" + "movq (%1, %%eax), %%mm4 \n\t" + "movq 8(%1, %%eax), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? 
-1 : 0 + "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "psraw $3, %%mm0 \n\t" + "psraw $3, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %%eax) \n\t" + "movq %%mm5, 8(%0, %%eax) \n\t" + + "addl $16, %%eax \n\t" + "js 1b \n\t" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) + : "%eax", "memory" + ); + block[0]= block0; + //Note, we dont do mismatch control for intra as errors cannot accumulate + + } else { + quant_matrix = s->inter_matrix; +asm volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlq $48, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movl %3, %%eax \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %%eax), %%mm0 \n\t" + "movq 8(%0, %%eax), %%mm1 \n\t" + "movq (%1, %%eax), %%mm4 \n\t" + "movq 8(%1, %%eax), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 + "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q + "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q + "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? 
-1 : 0 + "psrlw $4, %%mm0 \n\t" + "psrlw $4, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "pxor %%mm4, %%mm7 \n\t" + "pxor %%mm5, %%mm7 \n\t" + "movq %%mm4, (%0, %%eax) \n\t" + "movq %%mm5, 8(%0, %%eax) \n\t" + + "addl $16, %%eax \n\t" + "js 1b \n\t" + "movd 124(%0, %3), %%mm0 \n\t" + "movq %%mm7, %%mm6 \n\t" + "psrlq $32, %%mm7 \n\t" + "pxor %%mm6, %%mm7 \n\t" + "movq %%mm7, %%mm6 \n\t" + "psrlq $16, %%mm7 \n\t" + "pxor %%mm6, %%mm7 \n\t" + "pslld $31, %%mm7 \n\t" + "psrlq $15, %%mm7 \n\t" + "pxor %%mm7, %%mm0 \n\t" + "movd %%mm0, 124(%0, %3) \n\t" + + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) + : "%eax", "memory" ); } } @@ -441,18 +552,16 @@ void unused_var_warning_killer(){ void MPV_common_init_mmx(MpegEncContext *s) { if (mm_flags & MM_MMX) { - if (s->out_format == FMT_H263) - s->dct_unquantize = dct_unquantize_h263_mmx; - else - s->dct_unquantize = dct_unquantize_mpeg1_mmx; - - draw_edges = draw_edges_mmx; - - if(mm_flags & MM_MMXEXT){ - dct_quantize= dct_quantize_MMX2; - }else{ - dct_quantize= dct_quantize_MMX; - } + s->dct_unquantize_h263 = dct_unquantize_h263_mmx; + s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; + s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; + + draw_edges = draw_edges_mmx; + + if(mm_flags & MM_MMXEXT){ + dct_quantize= dct_quantize_MMX2; + } else { + dct_quantize= dct_quantize_MMX; + } } } - diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c index 2b3322915..aed537a23 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c @@ -1,21 +1,22 @@ /* - Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at> - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General 
Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ - + * MPEG video MMX templates + * + * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ #undef SPREADW #undef PMAXW #ifdef HAVE_MMX2 @@ -33,149 +34,165 @@ static int RENAME(dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, - int qscale) + int qscale, int *overflow) { - int i, level, last_non_zero_p1, q; - const UINT16 *qmat; + int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ... 
+ const UINT16 *qmat, *bias; static __align8 INT16 temp_block[64]; - int minLevel, maxLevel; - - if(s->avctx!=NULL && s->avctx->codec->id==CODEC_ID_MPEG4){ - /* mpeg4 */ - minLevel= -2048; - maxLevel= 2047; - }else if(s->out_format==FMT_MPEG1){ - /* mpeg1 */ - minLevel= -255; - maxLevel= 255; - }else if(s->out_format==FMT_MJPEG){ - /* (m)jpeg */ - minLevel= -1023; - maxLevel= 1023; - }else{ - /* h263 / msmpeg4 */ - minLevel= -128; - maxLevel= 127; - } av_fdct (block); - + if (s->mb_intra) { int dummy; if (n < 4) q = s->y_dc_scale; else q = s->c_dc_scale; - /* note: block[0] is assumed to be positive */ + if (!s->h263_aic) { #if 1 - asm volatile ( - "xorl %%edx, %%edx \n\t" - "mul %%ecx \n\t" - : "=d" (temp_block[0]), "=a"(dummy) - : "a" (block[0] + (q >> 1)), "c" (inverse[q]) - ); + asm volatile ( + "xorl %%edx, %%edx \n\t" + "mul %%ecx \n\t" + : "=d" (level), "=a"(dummy) + : "a" (block[0] + (q >> 1)), "c" (inverse[q]) + ); #else - asm volatile ( - "xorl %%edx, %%edx \n\t" - "divw %%cx \n\t" - "movzwl %%ax, %%eax \n\t" - : "=a" (temp_block[0]) - : "a" (block[0] + (q >> 1)), "c" (q) - : "%edx" - ); + asm volatile ( + "xorl %%edx, %%edx \n\t" + "divw %%cx \n\t" + "movzwl %%ax, %%eax \n\t" + : "=a" (level) + : "a" (block[0] + (q >> 1)), "c" (q) + : "%edx" + ); #endif + } else + /* For AIC we skip quant/dequant of INTRADC */ + level = block[0]; + + block[0]=0; //avoid fake overflow // temp_block[0] = (block[0] + (q >> 1)) / q; - i = 1; last_non_zero_p1 = 1; - if (s->out_format == FMT_H263) { - qmat = s->q_non_intra_matrix16; - } else { - qmat = s->q_intra_matrix16; - } - for(i=1;i<4;i++) { - level = block[i] * qmat[i]; - level = level / (1 << (QMAT_SHIFT_MMX - 3)); - /* XXX: currently, this code is not optimal. 
the range should be: - mpeg1: -255..255 - mpeg2: -2048..2047 - h263: -128..127 - mpeg4: -2048..2047 - */ - if (level > maxLevel) - level = maxLevel; - else if (level < minLevel) - level = minLevel; - temp_block[i] = level; - - if(level) - if(last_non_zero_p1 < inv_zigzag_direct16[i]) last_non_zero_p1= inv_zigzag_direct16[i]; - block[i]=0; - } + bias = s->q_intra_matrix16_bias[qscale]; + qmat = s->q_intra_matrix16[qscale]; } else { - i = 0; last_non_zero_p1 = 0; - qmat = s->q_non_intra_matrix16; + bias = s->q_inter_matrix16_bias[qscale]; + qmat = s->q_inter_matrix16[qscale]; } - asm volatile( /* XXX: small rounding bug, but it shouldnt matter */ - "movd %3, %%mm3 \n\t" - SPREADW(%%mm3) - "movd %4, %%mm4 \n\t" - SPREADW(%%mm4) -#ifndef HAVE_MMX2 - "movd %5, %%mm5 \n\t" - SPREADW(%%mm5) -#endif - "pxor %%mm7, %%mm7 \n\t" - "movd %%eax, %%mm2 \n\t" - SPREADW(%%mm2) - "movl %6, %%eax \n\t" - ".balign 16 \n\t" - "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq (%2, %%eax), %%mm1 \n\t" - "movq %%mm0, %%mm6 \n\t" - "psraw $15, %%mm6 \n\t" - "pmulhw %%mm0, %%mm1 \n\t" - "psubsw %%mm6, %%mm1 \n\t" -#ifdef HAVE_MMX2 - "pminsw %%mm3, %%mm1 \n\t" - "pmaxsw %%mm4, %%mm1 \n\t" -#else - "paddsw %%mm3, %%mm1 \n\t" - "psubusw %%mm4, %%mm1 \n\t" - "paddsw %%mm5, %%mm1 \n\t" -#endif - "movq %%mm1, (%8, %%eax) \n\t" - "pcmpeqw %%mm7, %%mm1 \n\t" - "movq (%7, %%eax), %%mm0 \n\t" - "movq %%mm7, (%1, %%eax) \n\t" - "pandn %%mm0, %%mm1 \n\t" - PMAXW(%%mm1, %%mm2) - "addl $8, %%eax \n\t" - " js 1b \n\t" - "movq %%mm2, %%mm0 \n\t" - "psrlq $32, %%mm2 \n\t" - PMAXW(%%mm0, %%mm2) - "movq %%mm2, %%mm0 \n\t" - "psrlq $16, %%mm2 \n\t" - PMAXW(%%mm0, %%mm2) - "movd %%mm2, %%eax \n\t" - "movzbl %%al, %%eax \n\t" - : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat+64), -#ifdef HAVE_MMX2 - "m" (maxLevel), "m" (minLevel), "m" (minLevel /* dummy */), "g" (2*i - 128), -#else - "m" (0x7FFF - maxLevel), "m" (0x7FFF -maxLevel + minLevel), "m" (minLevel), "g" (2*i - 128), -#endif - "r" 
(inv_zigzag_direct16+64), "r" (temp_block+64) - ); + if(s->out_format == FMT_H263){ + + asm volatile( + "movd %%eax, %%mm3 \n\t" // last_non_zero_p1 + SPREADW(%%mm3) + "pxor %%mm7, %%mm7 \n\t" // 0 + "pxor %%mm4, %%mm4 \n\t" // 0 + "movq (%2), %%mm5 \n\t" // qmat[0] + "pxor %%mm6, %%mm6 \n\t" + "psubw (%3), %%mm6 \n\t" // -bias[0] + "movl $-128, %%eax \n\t" + ".balign 16 \n\t" + "1: \n\t" + "pxor %%mm1, %%mm1 \n\t" // 0 + "movq (%1, %%eax), %%mm0 \n\t" // block[i] + "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 + "pxor %%mm1, %%mm0 \n\t" + "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) + "psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] + "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 + "por %%mm0, %%mm4 \n\t" + "pxor %%mm1, %%mm0 \n\t" + "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) + "movq %%mm0, (%5, %%eax) \n\t" + "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 + "movq (%4, %%eax), %%mm1 \n\t" + "movq %%mm7, (%1, %%eax) \n\t" // 0 + "pandn %%mm1, %%mm0 \n\t" + PMAXW(%%mm0, %%mm3) + "addl $8, %%eax \n\t" + " js 1b \n\t" + "movq %%mm3, %%mm0 \n\t" + "psrlq $32, %%mm3 \n\t" + PMAXW(%%mm0, %%mm3) + "movq %%mm3, %%mm0 \n\t" + "psrlq $16, %%mm3 \n\t" + PMAXW(%%mm0, %%mm3) + "movd %%mm3, %%eax \n\t" + "movzbl %%al, %%eax \n\t" // last_non_zero_p1 + : "+a" (last_non_zero_p1) + : "r" (block+64), "r" (qmat), "r" (bias), + "r" (inv_zigzag_direct16+64), "r" (temp_block+64) + ); + // note the asm is split cuz gcc doesnt like that many operands ... 
+ asm volatile( + "movd %1, %%mm1 \n\t" // max_qcoeff + SPREADW(%%mm1) + "psubusw %%mm1, %%mm4 \n\t" + "packuswb %%mm4, %%mm4 \n\t" + "movd %%mm4, %0 \n\t" // *overflow + : "=g" (*overflow) + : "g" (s->max_qcoeff) + ); + }else{ // FMT_H263 + asm volatile( + "movd %%eax, %%mm3 \n\t" // last_non_zero_p1 + SPREADW(%%mm3) + "pxor %%mm7, %%mm7 \n\t" // 0 + "pxor %%mm4, %%mm4 \n\t" // 0 + "movl $-128, %%eax \n\t" + ".balign 16 \n\t" + "1: \n\t" + "pxor %%mm1, %%mm1 \n\t" // 0 + "movq (%1, %%eax), %%mm0 \n\t" // block[i] + "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 + "pxor %%mm1, %%mm0 \n\t" + "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) + "movq (%3, %%eax), %%mm6 \n\t" // bias[0] + "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] + "movq (%2, %%eax), %%mm5 \n\t" // qmat[i] + "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 + "por %%mm0, %%mm4 \n\t" + "pxor %%mm1, %%mm0 \n\t" + "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) + "movq %%mm0, (%5, %%eax) \n\t" + "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 + "movq (%4, %%eax), %%mm1 \n\t" + "movq %%mm7, (%1, %%eax) \n\t" // 0 + "pandn %%mm1, %%mm0 \n\t" + PMAXW(%%mm0, %%mm3) + "addl $8, %%eax \n\t" + " js 1b \n\t" + "movq %%mm3, %%mm0 \n\t" + "psrlq $32, %%mm3 \n\t" + PMAXW(%%mm0, %%mm3) + "movq %%mm3, %%mm0 \n\t" + "psrlq $16, %%mm3 \n\t" + PMAXW(%%mm0, %%mm3) + "movd %%mm3, %%eax \n\t" + "movzbl %%al, %%eax \n\t" // last_non_zero_p1 + : "+a" (last_non_zero_p1) + : "r" (block+64), "r" (qmat+64), "r" (bias+64), + "r" (inv_zigzag_direct16+64), "r" (temp_block+64) + ); + // note the asm is split cuz gcc doesnt like that many operands ... 
+ asm volatile( + "movd %1, %%mm1 \n\t" // max_qcoeff + SPREADW(%%mm1) + "psubusw %%mm1, %%mm4 \n\t" + "packuswb %%mm4, %%mm4 \n\t" + "movd %%mm4, %0 \n\t" // *overflow + : "=g" (*overflow) + : "g" (s->max_qcoeff) + ); + } + + if(s->mb_intra) temp_block[0]= level; //FIXME move afer permute // last_non_zero_p1=64; /* permute for IDCT */ asm volatile( - "movl %0, %%eax \n\t" + "movl %0, %%eax \n\t" "pushl %%ebp \n\t" "movl %%esp, " MANGLE(esp_temp) "\n\t" "1: \n\t" @@ -203,5 +220,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s, } */ //block_permute(block); + return last_non_zero_p1 - 1; } diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c index 297f23724..4f19cc20a 100644 --- a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c @@ -1,29 +1,43 @@ /* - Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ - -#include <inttypes.h> + * Simple IDCT MMX + * + * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ #include "../dsputil.h" +/* +23170.475006 +22725.260826 +21406.727617 +19265.545870 +16384.000000 +12872.826198 +8866.956905 +4520.335430 +*/ #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#if 0 #define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#else +#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 +#endif #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 @@ -31,8 +45,8 @@ #define ROW_SHIFT 11 #define COL_SHIFT 20 // 6 -static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; -static uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; +static const uint64_t 
__attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; +static const uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; static int16_t __attribute__((aligned(8))) temp[64]; static int16_t __attribute__((aligned(8))) coeffs[]= { 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, @@ -43,27 +57,31 @@ static int16_t __attribute__((aligned(8))) coeffs[]= { // 0, 0, 0, 0, // 0, 0, 0, 0, - C4, C2, C4, C2, - C4, C6, C4, C6, - C1, C3, C1, C3, - C5, C7, C5, C7, + C4, C4, C4, C4, + C4, -C4, C4, -C4, + + C2, C6, C2, C6, + C6, -C2, C6, -C2, + + C1, C3, C1, C3, + C5, C7, C5, C7, + + C3, -C7, C3, -C7, +-C1, -C5, -C1, -C5, + + C5, -C1, C5, -C1, + C7, C3, C7, C3, + + C7, -C5, C7, -C5, + C3, -C1, C3, -C1 +}; - C4, C6, C4, C6, - -C4, -C2, -C4, -C2, - C3, -C7, C3, -C7, - -C1, -C5, -C1, -C5, - - C4, -C6, C4, -C6, - -C4, C2, -C4, C2, - C5, -C1, C5, -C1, - C7, C3, C7, C3, - - C4, -C2, C4, -C2, - C4, -C6, C4, -C6, - C7, -C5, C7, -C5, - C3, -C1, C3, -C1 - }; #if 0 +static void unused_var_killer(){ + int a= wm1010 + d40000; + temp[0]=a; +} + static void inline idctCol (int16_t * col, int16_t *input) { #undef C0 @@ -79,7 +97,7 @@ static void inline idctCol (int16_t * col, int16_t *input) const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 @@ -128,7 +146,7 @@ static void inline idctRow (int16_t * output, int16_t * input) const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C4 = 16384; 
//cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 @@ -188,110 +206,160 @@ row[7] = input[13]; static inline void idct(int16_t *block) { - int i; -//for(i=0; i<64; i++) temp[i]= block[ block_permute_op(i) ]; -//for(i=0; i<64; i++) temp[block_permute_op(i)]= block[ i ]; -//for(i=0; i<64; i++) block[i]= temp[i]; -//block_permute(block); -/* -idctRow(temp, block); -idctRow(temp+16, block+16); -idctRow(temp+1, block+2); -idctRow(temp+17, block+18); -idctRow(temp+32, block+32); -idctRow(temp+48, block+48); -idctRow(temp+33, block+34); -idctRow(temp+49, block+50); -*/ - asm volatile( -// "lea 64(%0), %%eax \n\t" -//r0,r2,R0,R2 r4,r6,R4,R6 r1,r3,R1,R3 r5,r7,R5,R7 -//src0 src4 src1 src5 -//r0,R0,r7,R7 r1,R1,r6,R6 r2,R2,r5,R5 r3,R3,r4,R4 -//dst0 dst1 dst2 dst3 #if 0 //Alternative, simpler variant -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + +#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* 
C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ #rounder ", %%mm4 \n\t"\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 
-C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0(%%mm6, %%mm4, dst) \ -\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ - "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ + +#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 
40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ #rounder ", %%mm4 \n\t"\ -\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + #rounder ", %%mm0 \n\t"\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE1(%%mm6, %%mm4, dst, %%mm7) \ -\ - "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " 
\n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ - #rounder ", %%mm4 \n\t"\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ "psrad $" #shift ", %%mm6 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst) - -#define 
DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t"\ + + +#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq wm1010, %%mm4 \n\t"\ + "movq "MANGLE(wm1010)", %%mm4 \n\t"\ "pand %%mm0, %%mm4 \n\t"\ "por %%mm1, %%mm4 \n\t"\ "por %%mm2, %%mm4 \n\t"\ @@ -300,234 +368,106 @@ idctRow(temp+49, block+50); "movd %%mm4, %%eax \n\t"\ "orl %%eax, %%eax \n\t"\ "jz 1f \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* 
C3R3+C1R1 C3r3+C1r1 */\ #rounder ", %%mm4 \n\t"\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0(%%mm6, %%mm4, dst) \ -\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ - "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ - #rounder ", %%mm4 \n\t"\ -\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE1(%%mm6, %%mm4, dst, %%mm7) \ -\ - "movq 
88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ - #rounder ", %%mm4 \n\t"\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* 
a2-B2 a2-b2 */\ "psrad $" #shift ", %%mm2 \n\t"\ "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ "jmp 2f \n\t"\ "1: \n\t"\ - WRITE3(%%mm0, dst)\ - "2: \n\t"\ - - -#define WRITE0(s0, s7, dst)\ - "movq " #s0 ", " #dst " \n\t" /* R0 r0 */\ - "movq " #s7 ", 24+" #dst " \n\t" /* R7 r7 */ - -#define WRITE1(s1, s6, dst, tmp)\ - "movq " #dst ", " #tmp " \n\t" /* R0 r0 */\ - "packssdw " #s1 ", " #tmp " \n\t" /* R1 r1 R0 r0*/\ - "movq " #tmp ", " #dst " \n\t"\ - "movq 24+" #dst ", " #tmp " \n\t" /* R7 r7 */\ - "packssdw " #tmp ", " #s6 " \n\t" /* R7 r7 R6 r6*/\ - "movq " #s6 ", 24+" #dst " \n\t" - -#define WRITE2(s2, s5, s3, s4, dst)\ - "packssdw " #s3 ", " #s2 " \n\t" /* R3 r3 R2 r2*/\ - "packssdw " #s5 ", " #s4 " \n\t" /* R5 r5 R4 r4*/\ - "movq " #s2 ", 8+" #dst " \n\t"\ - "movq " #s4 ", 16+" #dst " \n\t" - -#define WRITE3(a, dst)\ - "pslld $16, " #a " \n\t"\ - "psrad $13, " #a " \n\t"\ - "packssdw " #a ", " #a " \n\t"\ - "movq " #a ", " #dst " \n\t"\ - "movq " #a ", 8+" #dst " \n\t"\ - "movq " #a ", 16+" #dst " \n\t"\ - "movq " #a ", 24+" #dst " \n\t"\ - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) -/* -DC_COND_IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) -DC_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) -DC_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) -*/ -IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) -IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 
64(%1),paddd (%2), 11) -IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) + "pslld $16, %%mm0 \n\t"\ + "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ + "psrad $13, %%mm0 \n\t"\ + "packssdw %%mm0, %%mm0 \n\t"\ + "movq %%mm0, " #dst " \n\t"\ + "movq %%mm0, 8+" #dst " \n\t"\ + "movq %%mm0, 16+" #dst " \n\t"\ + "movq %%mm0, 24+" #dst " \n\t"\ + "2: \n\t" -#undef WRITE0 -#undef WRITE1 -#undef WRITE2 -#define WRITE0(s0, s7, dst)\ - "packssdw " #s0 ", " #s0 " \n\t" /* C0, c0, C0, c0 */\ - "packssdw " #s7 ", " #s7 " \n\t" /* C7, c7, C7, c7 */\ - "movd " #s0 ", " #dst " \n\t" /* C0, c0 */\ - "movd " #s7 ", 112+" #dst " \n\t" /* C7, c7 */ +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) +/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) +ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) +ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ -#define WRITE1(s1, s6, dst, tmp)\ - "packssdw " #s1 ", " #s1 " \n\t" /* C1, c1, C1, c1 */\ - "packssdw " #s6 ", " #s6 " \n\t" /* C6, c6, C6, c6 */\ - "movd " #s1 ", 16+" #dst " \n\t" /* C1, c1 */\ - "movd " #s6 ", 96+" #dst " \n\t" /* C6, c6 */ +DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) +DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) +DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) -#define WRITE2(s2, s5, s3, s4, dst)\ - "packssdw " #s2 ", " #s2 " \n\t" /* C2, c2, C2, c2 */\ - "packssdw " #s3 ", " #s3 " \n\t" /* C3, c3, C3, c3 */\ - "movd " #s2 ", 32+" #dst " \n\t" /* C2, c2 */\ - "movd " #s3 ", 48+" #dst " \n\t" /* C3, c3 */\ - "packssdw " #s4 ", " #s4 " \n\t" /* C4, c4, C4, c4 */\ - "packssdw " #s5 ", " #s5 " \n\t" /* C5, c5, C5, c5 */\ - "movd " #s4 ", 64+" #dst " \n\t" /* C4, c4 */\ - "movd " #s5 ", 80+" #dst " \n\t" /* C5, c5 */\ -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 
0(%0),/nop, 20) -IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) #else -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ +#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - #rounder ", %%mm4 \n\t"\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0(%%mm6, %%mm4, dst) \ -\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ - "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* 
-C7R3+C3R1 -C7r3+C3r1 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ - #rounder ", %%mm4 \n\t"\ -\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE1(%%mm6, %%mm4, dst, %%mm7) \ -\ - "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ - #rounder ", %%mm4 \n\t"\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst) - -#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq " #src4 ", %%mm1 
\n\t" /* R6 R4 r6 r4 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq wm1010, %%mm4 \n\t"\ + "movq "MANGLE(wm1010)", %%mm4 \n\t"\ "pand %%mm0, %%mm4 \n\t"\ "por %%mm1, %%mm4 \n\t"\ "por %%mm2, %%mm4 \n\t"\ @@ -536,920 +476,822 @@ IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "movd %%mm4, %%eax \n\t"\ "orl %%eax, %%eax \n\t"\ "jz 1f \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ #rounder ", %%mm4 \n\t"\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 
*/\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0(%%mm6, %%mm4, dst) \ -\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ - "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ - #rounder ", %%mm4 \n\t"\ -\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE1(%%mm6, %%mm4, dst, %%mm7) \ -\ - "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " 
#src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ - #rounder ", %%mm4 \n\t"\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ "psrad $" #shift ", %%mm2 \n\t"\ "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ "jmp 2f \n\t"\ - "#.balign 16 \n\t"\ "1: \n\t"\ - 
WRITE3(%%mm0, dst)\ - "2: \n\t"\ + "pslld $16, %%mm0 \n\t"\ + "paddd "MANGLE(d40000)", %%mm0 \n\t"\ + "psrad $13, %%mm0 \n\t"\ + "packssdw %%mm0, %%mm0 \n\t"\ + "movq %%mm0, " #dst " \n\t"\ + "movq %%mm0, 8+" #dst " \n\t"\ + "movq %%mm0, 16+" #dst " \n\t"\ + "movq %%mm0, 24+" #dst " \n\t"\ + "2: \n\t" -#define Z_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift, bt) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ +#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ "movq %%mm0, %%mm4 \n\t"\ "por %%mm1, %%mm4 \n\t"\ "por %%mm2, %%mm4 \n\t"\ "por %%mm3, %%mm4 \n\t"\ - "packssdw %%mm4, %%mm4 \n\t"\ + "packssdw %%mm4,%%mm4 \n\t"\ "movd %%mm4, %%eax \n\t"\ "orl %%eax, %%eax \n\t"\ "jz " #bt " \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ #rounder ", %%mm4 \n\t"\ -\ - 
"movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0(%%mm6, %%mm4, dst) \ -\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ - "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ - #rounder ", %%mm4 \n\t"\ -\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE1(%%mm6, %%mm4, dst, %%mm7) \ -\ - "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd 
%%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ - #rounder ", %%mm4 \n\t"\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ "psrad $" #shift ", %%mm2 \n\t"\ 
"psrad $" #shift ", %%mm0 \n\t"\ - WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\ - - -#define WRITE0(s0, s7, dst)\ - "movq " #s0 ", " #dst " \n\t" /* R0 r0 */\ - "movq " #s7 ", 24+" #dst " \n\t" /* R7 r7 */ - -#define WRITE1(s1, s6, dst, tmp)\ - "movq " #dst ", " #tmp " \n\t" /* R0 r0 */\ - "packssdw " #s1 ", " #tmp " \n\t" /* R1 r1 R0 r0*/\ - "movq " #tmp ", " #dst " \n\t"\ - "movq 24+" #dst ", " #tmp " \n\t" /* R7 r7 */\ - "packssdw " #tmp ", " #s6 " \n\t" /* R7 r7 R6 r6*/\ - "movq " #s6 ", 24+" #dst " \n\t" - -#define WRITE2(s2, s5, s3, s4, dst)\ - "packssdw " #s3 ", " #s2 " \n\t" /* R3 r3 R2 r2*/\ - "packssdw " #s5 ", " #s4 " \n\t" /* R5 r5 R4 r4*/\ - "movq " #s2 ", 8+" #dst " \n\t"\ - "movq " #s4 ", 16+" #dst " \n\t" - -#define WRITE3(a, dst)\ - "pslld $16, " #a " \n\t"\ - "paddd d40000, " #a " \n\t"\ - "psrad $13, " #a " \n\t"\ - "packssdw " #a ", " #a " \n\t"\ - "movq " #a ", " #dst " \n\t"\ - "movq " #a ", 8+" #dst " \n\t"\ - "movq " #a ", 16+" #dst " \n\t"\ - "movq " #a ", 24+" #dst " \n\t"\ - -#define WRITE0b(s0, s7, dst)\ - "packssdw " #s0 ", " #s0 " \n\t" /* C0, c0, C0, c0 */\ - "packssdw " #s7 ", " #s7 " \n\t" /* C7, c7, C7, c7 */\ - "movd " #s0 ", " #dst " \n\t" /* C0, c0 */\ - "movd " #s7 ", 112+" #dst " \n\t" /* C7, c7 */ - -#define WRITE1b(s1, s6, dst, tmp)\ - "packssdw " #s1 ", " #s1 " \n\t" /* C1, c1, C1, c1 */\ - "packssdw " #s6 ", " #s6 " \n\t" /* C6, c6, C6, c6 */\ - "movd " #s1 ", 16+" #dst " \n\t" /* C1, c1 */\ - "movd " #s6 ", 96+" #dst " \n\t" /* C6, c6 */ - -#define WRITE2b(s2, s5, s3, s4, dst)\ - "packssdw " #s2 ", " #s2 " \n\t" /* C2, c2, C2, c2 */\ - "packssdw " #s3 ", " #s3 " \n\t" /* C3, c3, C3, c3 */\ - "movd " #s2 ", 32+" #dst " \n\t" /* C2, c2 */\ - "movd " #s3 ", 48+" #dst " \n\t" /* C3, c3 */\ - "packssdw " #s4 ", " #s4 " \n\t" /* C4, c4, C4, c4 */\ - "packssdw " #s5 ", " #s5 " \n\t" /* C5, c5, C5, c5 */\ - "movd " #s4 ", 64+" #dst " \n\t" /* C4, c4 */\ - "movd " #s5 ", 80+" #dst " \n\t" /* C5, c5 */\ - - -//IDCT_CORE( src0, src4, 
src1, src5, dst, rounder, shift) -DC_COND_IDCT_CORE( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) -Z_COND_IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) -Z_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) -Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ -#undef IDCT_CORE -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ +#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 
*/\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0b(%%mm6, %%mm4, dst) \ -\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ - "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ -\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + 
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE1b(%%mm6, %%mm4, dst, %%mm7) \ -\ - "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) +Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) +Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 
64(%1),paddd (%2), 11, 2f) +Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + #rounder ", %%mm0 \n\t"\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ 
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ "psrad $" #shift ", %%mm6 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ - "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, 
dst) - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "jmp 9f \n\t" "#.balign 16 \n\t"\ "4: \n\t" -Z_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) -Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) +Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) +Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) -#undef IDCT_CORE -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "movq 
16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + #rounder ", %%mm0 \n\t"\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0b(%%mm7, %%mm4, dst) \ -\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ -\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE1b(%%mm7, %%mm4, dst, %%mm6) \ -\ - "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* 
C3R7+C7R5 C3r7+C7r5 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\ - "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "psrad $" #shift ", %%mm7 \n\t"\ + "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm1 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ - "paddd %%mm0, %%mm3 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm3, %%mm0 \n\t" /* A3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm3 \n\t"\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst) - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm1, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm1, %%mm5 
\n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm1, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "jmp 9f \n\t" "#.balign 16 \n\t"\ "6: \n\t" -Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) +Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) -#undef IDCT_CORE -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0b(%%mm7, %%mm4, dst) \ -\ + "movq 16(%2), %%mm4 \n\t" /* 
C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + #rounder ", %%mm0 \n\t"\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ -\ - "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm5, %%mm7 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\ - "psubd %%mm7, %%mm5 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - WRITE1b(%%mm7, %%mm5, dst, %%mm6) \ -\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "psrad $" #shift ", %%mm7 \n\t"\ + "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm1 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm0, %%mm3 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm3, %%mm0 \n\t" /* A3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm3 \n\t"\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst) - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( (%1), 64(%1), 
32(%1), 96(%1), 0(%0),/nop, 20) -IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm1, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm1, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "jmp 9f \n\t" "#.balign 16 \n\t"\ "2: \n\t" -Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) +Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) -#undef IDCT_CORE -#define 
IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + #rounder ", %%mm0 \n\t"\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0b(%%mm6, %%mm4, dst) \ -\ - "movq 
64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ -\ - "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm5, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm5 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - WRITE1b(%%mm6, %%mm5, dst, %%mm7) \ -\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "movq 
%%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ "psrad $" #shift ", %%mm6 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst) - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "jmp 9f \n\t" "#.balign 16 \n\t"\ "3: \n\t" -#undef IDCT_CORE -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ 
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + #rounder ", %%mm0 \n\t"\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 64(%2), %%mm3 \n\t"\ + "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm7 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0b(%%mm6, %%mm4, dst) \ -\ - "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ -\ - "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm5, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm5 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd 
%%mm7, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm1, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ + "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm1 \n\t"\ "psrad $" #shift ", %%mm5 \n\t"\ - WRITE1b(%%mm6, %%mm5, dst, %%mm7) \ -\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ "psrad $" #shift ", %%mm6 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst) - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm1, 32+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + 
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "jmp 9f \n\t" "#.balign 16 \n\t"\ "5: \n\t" -#undef IDCT_CORE -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "movq %%mm4, %%mm6\n\t"\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "movq %%mm5, %%mm7\n\t"\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 8+" #src0 ", %%mm2 \n\t" /*2R2 R0 r2 r0 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /*2C2R2+C4R0 C2r2+C4r0 */\ - "movq 8+" #src4 ", %%mm3 \n\t" /*2R6 R4 r6 r4 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /*2C6R6+C4R4 C6r6+C4r4 */\ -\ +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + #rounder ", %%mm0 \n\t"\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" 
/* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ + "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ + "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + #rounder ", %%mm1 \n\t"\ + "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ + "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ + #rounder ", %%mm2 \n\t"\ + "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ + "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ + "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ + "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ "psrad $" #shift ", %%mm4 \n\t"\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ -\ - "paddd %%mm7, %%mm6 \n\t" /*2A0 a0 */\ - "movq 56(%2), %%mm7 \n\t" /* -C2 -C4 -C2 -C4 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "pmaddwd %%mm1, %%mm7 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ -\ - "packssdw %%mm6, %%mm4 \n\t" /* C0, c0, C0, c0 */\ - "movq 48(%2), %%mm6 \n\t" /* C6 C4 C6 C4 */\ - "movq %%mm4, " #dst " \n\t" /* C0, c0 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /*2C6R2+C4R0 C6r2+C4r0 */\ -\ - "movq %%mm4, 112+" #dst " \n\t" /* C0, c0 */\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm3, %%mm4 \n\t" /*2-C2R6-C4R4 -C2r6-C4r4 */\ -\ - "paddd %%mm5, %%mm7 \n\t" /* A1 a1 */\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ "psrad $" #shift ", %%mm7 \n\t"\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ -\ - "paddd %%mm4, %%mm6 \n\t" /*2A1 a1 */\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ -\ - "psrad $" #shift ", %%mm6 \n\t"\ - "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ -\ - "pmaddwd 120(%2), 
%%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "packssdw %%mm6, %%mm7 \n\t" /* C1, c1, C1, c1 */\ -\ - "movq 80(%2), %%mm6 \n\t" /* -C6 C4 -C6 C4 */\ - "movq %%mm7, 16+" #dst " \n\t" /* C1, c1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /*2-C6R2+C4R0 -C6r2+C4r0 */\ -\ - "movq %%mm7, 96+" #dst " \n\t" /* C1, c1 */\ - "movq 88(%2), %%mm7 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /*2C2R6-C4R4 C2r6-C4r4 */\ -\ - "pmaddwd 112(%2), %%mm2 \n\t" /*2-C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ -\ - "pmaddwd 120(%2), %%mm3 \n\t" /*2-C6R6+C4R4 -C6r6+C4r4 */\ - "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm7, %%mm6 \n\t" /*2A2 a2 */\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ -\ - "psrad $" #shift ", %%mm6 \n\t"\ -\ - "packssdw %%mm6, %%mm4 \n\t" /* C2, c2, C2, c2 */\ - "movq %%mm4, 32+" #dst " \n\t" /* C2, c2 */\ + "psrad $" #shift ", %%mm3 \n\t"\ + "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ + "movq %%mm4, " #dst " \n\t"\ "psrad $" #shift ", %%mm0 \n\t"\ - "paddd %%mm3, %%mm2 \n\t" /*2A3 a3 */\ -\ - "movq %%mm4, 80+" #dst " \n\t" /* C2, c2 */\ + "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ + "movq %%mm0, 16+" #dst " \n\t"\ + "movq %%mm0, 96+" #dst " \n\t"\ + "movq %%mm4, 112+" #dst " \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "psrad $" #shift ", %%mm6 \n\t"\ "psrad $" #shift ", %%mm2 \n\t"\ -\ - "packssdw %%mm2, %%mm0 \n\t" /* C3, c3, C3, c3 */\ - "movq %%mm0, 48+" #dst " \n\t" /* C3, c3 */\ - "movq %%mm0, 64+" #dst " \n\t" /* C3, c3 */\ - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -//IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -//IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movq %%mm5, 32+" #dst " \n\t"\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movq %%mm6, 48+" #dst " \n\t"\ + "movq %%mm6, 
64+" #dst " \n\t"\ + "movq %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "jmp 9f \n\t" "#.balign 16 \n\t"\ "1: \n\t" -#undef IDCT_CORE -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ - "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + #rounder ", %%mm0 \n\t"\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ -\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* 
-C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 64(%2), %%mm1 \n\t"\ + "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - WRITE0b(%%mm6, %%mm4, dst) \ -\ - "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ - "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ -\ - "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm6 \n\t"\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm7 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - WRITE1b(%%mm6, %%mm4, dst, %%mm7) \ -\ - "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ - "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ - "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ -\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ - "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ - "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ - "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm3 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " \n\t"\ + 
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm3, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ + "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm3 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ + "movd %%mm3, 32+" #dst " \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ -\ - "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ - "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ - "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ - "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst) - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT( 24(%1), 88(%1), 
56(%1), 120(%1), 12(%0),/nop, 20) "jmp 9f \n\t" "#.balign 16 \n\t" "7: \n\t" -#undef IDCT_CORE -#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ - "movq 16(%2), %%mm2 \n\t" /* C2 C4 C2 C4 */\ - "movq 8+" #src0 ", %%mm1 \n\t" /* R2 R0 r2 r0 */\ - "pmaddwd %%mm0, %%mm2 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ - "movq 16(%2), %%mm3 \n\t" /* C2 C4 C2 C4 */\ - "pmaddwd %%mm1, %%mm3 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ -\ - "movq 48(%2), %%mm4 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ - "movq 80(%2), %%mm6 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm0, %%mm6 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "movq 80(%2), %%mm7 \n\t" /* -C6 C4 -C6 C4 */\ - "pmaddwd %%mm1, %%mm7 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ - "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm3 \n\t"\ - "pmaddwd 112(%2), %%mm1 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ - "packssdw %%mm3, %%mm2 \n\t" /* C0, c0, C0, c0 */\ - "movq %%mm2, " #dst " \n\t" /* C0, c0 */\ +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + #rounder ", %%mm4 \n\t"\ + #rounder ", %%mm0 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm2, 112+" #dst " \n\t" /* C0, c0 */\ - "packssdw %%mm5, %%mm4 \n\t" /* C1, c1, C1, c1 */\ - "movq %%mm4, 16+" #dst " \n\t" /* C0, c0 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm6 \n\t"\ - "movq %%mm4, 96+" #dst " \n\t" /* C0, c0 */\ - "packssdw %%mm7, %%mm6 \n\t" /* C2, c2, C2, c2 */\ - "movq %%mm6, 32+" #dst " \n\t" /* 
C0, c0 */\ "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, 80+" #dst " \n\t" /* C0, c0 */\ + "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ + "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ + #rounder ", %%mm1 \n\t"\ + #rounder ", %%mm2 \n\t"\ "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm1, %%mm0 \n\t" /* C3, c3, C3, c3 */\ - "movq %%mm0, 48+" #dst " \n\t" /* C0, c0 */\ - "movq %%mm0, 64+" #dst " \n\t" /* C0, c0 */\ - -//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) -IDCT_CORE( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -//IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -//IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ + "movq %%mm4, " #dst " \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ + "movq %%mm0, 16+" #dst " \n\t"\ + "movq %%mm0, 96+" #dst " \n\t"\ + "movq %%mm4, 112+" #dst " \n\t"\ + "movq %%mm0, 32+" #dst " \n\t"\ + "movq %%mm4, 48+" #dst " \n\t"\ + "movq %%mm4, 64+" #dst " \n\t"\ + "movq %%mm0, 80+" #dst " \n\t" + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) #endif /* Input - 00 20 02 22 40 60 42 62 - 10 30 12 32 50 70 52 72 - 01 21 03 23 41 61 43 63 + 00 40 04 44 20 60 24 64 + 10 30 14 34 50 70 54 74 + 01 41 03 43 21 61 23 63 11 31 13 33 51 71 53 73 - 04 24 06 26 44 64 46 66 - 14 34 16 36 54 74 56 76 -... 
-*/ -/* + 02 42 06 46 22 62 26 66 + 12 32 16 36 52 72 56 76 + 05 45 07 47 25 65 27 67 + 15 35 17 37 55 75 57 77 + Temp - 00 02 10 12 20 22 30 32 - 40 42 50 52 60 62 70 72 + 00 04 10 14 20 24 30 34 + 40 44 50 54 60 64 70 74 01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73 - 04 06 14 16 24 26 34 36 - 44 46 54 56 64 66 74 76 + 02 06 12 16 22 26 32 36 + 42 46 52 56 62 66 72 76 05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77 */ -/* -Output - 00 10 20 30 40 50 60 70 - 01 11 21 31 41 51 61 71 -... -*/ - "9: \n\t" :: "r" (block), "r" (temp), "r" (coeffs) : "%eax" ); -/* -idctCol(block, temp); -idctCol(block+1, temp+2); -idctCol(block+2, temp+4); -idctCol(block+3, temp+6); -idctCol(block+4, temp+8); -idctCol(block+5, temp+10); -idctCol(block+6, temp+12); -idctCol(block+7, temp+14); -*/ } void simple_idct_mmx(int16_t *block) { - static int imax=0, imin=0; - static int omax=0, omin=0; - int i, j; -/* - for(i=0; i<64; i++) - { - if(block[i] > imax) - { - imax= block[i]; - printf("Input-Max: %d\n", imax); - printf("Input-Min: %d\n", imin); - printf("Output-Max: %d\n", omax); - printf("Output-Min: %d\n", omin); - } - if(block[i] < imin) - { - imin= block[i]; - printf("Input-Max: %d\n", imax); - printf("Input-Min: %d\n", imin); - printf("Output-Max: %d\n", omax); - printf("Output-Min: %d\n", omin); - } - }*/ -/* static int stat[64]; - for(j=0; j<4; j++) - { - static int line[8]={0,2,1,3,4,6,5,7}; - for(i=0; i<16; i++) - { - if(block[j*16+i]) - { - stat[j*16+1]++; - break; - } - } - for(i=0; i<16; i++) - { - if(block[j*16+i] && i!=0 && i!=2) - { - stat[j*16+2]++; - break; - } - } - } - stat[0]++;*/ -/* for(i=1; i<8; i++) - { - if(block[i] != 0) - { - stat[1]++; - break; - } - } - for(i=32; i<64; i++) - { - if(block[i] != 0) - { - stat[2]++; - break; - } - } - stat[0]++; -*/ -// return; idct(block); -// memset(block, 0, 128); -/* - if(stat[0] > 100000) - for(i=0; i<64; i++) - { - if((i&7) == 0) printf("\n"); - printf("%06d ", stat[i]); - } -*/ -/* - for(i=0; i<4; i++) printf("%d", 
stat[1+i*16]); - printf(" "); - for(i=0; i<4; i++) printf("%d", stat[2+i*16]); - printf("\n"); -*/ -// printf("%d", stat[2]); - -// memset(stat, 0, 256); - -/* - for(i=0; i<64; i++) - { - if(block[i] > omax) - { - omax= block[i]; - printf("Input-Max: %d\n", imax); - printf("Input-Min: %d\n", imin); - printf("Output-Max: %d\n", omax); - printf("Output-Min: %d\n", omin); - } - if(block[i] < omin) - { - omin= block[i]; - printf("Input-Max: %d\n", imax); - printf("Input-Min: %d\n", imin); - printf("Output-Max: %d\n", omax); - printf("Output-Min: %d\n", omin); - } - }*/ } diff --git a/src/libffmpeg/libavcodec/imgconvert.c b/src/libffmpeg/libavcodec/imgconvert.c index d39b6c1e9..04300744f 100644 --- a/src/libffmpeg/libavcodec/imgconvert.c +++ b/src/libffmpeg/libavcodec/imgconvert.c @@ -1,20 +1,20 @@ /* * Misc image convertion routines - * Copyright (c) 2001 Gerard Lantau. + * Copyright (c) 2001, 2002 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "avcodec.h" #include "dsputil.h" @@ -361,6 +361,8 @@ int img_convert(AVPicture *dst, int dst_pix_fmt, { int i; + assert(pix_fmt != PIX_FMT_ANY && dst_pix_fmt != PIX_FMT_ANY); + if (dst_pix_fmt == pix_fmt) { switch(pix_fmt) { case PIX_FMT_YUV420P: @@ -479,7 +481,7 @@ static void deinterlace_bottom_field(UINT8 *dst, int dst_wrap, int y, y1, i; UINT8 *buf; - buf= (UINT8*) malloc(5 * width); + buf = (UINT8*)av_malloc(5 * width); src = src1; for(y=0;y<height;y+=2) { @@ -509,7 +511,7 @@ static void deinterlace_bottom_field(UINT8 *dst, int dst_wrap, dst += dst_wrap; src += (2 + 1) * src_wrap; } - free(buf); + av_free(buf); } @@ -546,3 +548,5 @@ int avpicture_deinterlace(AVPicture *dst, AVPicture *src, } return 0; } + +#undef FIX diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c index fda5a31c4..26519bd38 100644 --- a/src/libffmpeg/libavcodec/imgresample.c +++ b/src/libffmpeg/libavcodec/imgresample.c @@ -1,27 +1,23 @@ /* * High quality image resampling with polyphase filters - * Copyright (c) 2001 Gerard Lantau. + * Copyright (c) 2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
+ * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <math.h> -#include "dsputil.h" #include "avcodec.h" +#include "dsputil.h" #ifdef USE_FASTMEMCPY #include "fastmemcpy.h" @@ -454,7 +450,7 @@ ImgReSampleContext *img_resample_init(int owidth, int oheight, return s; fail: - free(s); + av_free(s); return NULL; } @@ -474,8 +470,8 @@ void img_resample(ImgReSampleContext *s, void img_resample_close(ImgReSampleContext *s) { - free(s->line_buf); - free(s); + av_free(s->line_buf); + av_free(s); } #ifdef TEST @@ -522,7 +518,7 @@ static void dump_filter(INT16 *filter) } #ifdef HAVE_MMX -int mm_flags; +extern int mm_flags; #endif int main(int argc, char **argv) @@ -609,7 +605,7 @@ int main(int argc, char **argv) img, XSIZE, XSIZE, YSIZE); if (memcmp(img1, img2, xsize * ysize) != 0) { fprintf(stderr, "mmx error\n"); - abort(); + exit(1); } printf("MMX OK\n"); 
#endif diff --git a/src/libffmpeg/libavcodec/jfdctfst.c b/src/libffmpeg/libavcodec/jfdctfst.c index cdc3b47f9..4e3b55bb5 100644 --- a/src/libffmpeg/libavcodec/jfdctfst.c +++ b/src/libffmpeg/libavcodec/jfdctfst.c @@ -113,7 +113,7 @@ */ GLOBAL(void) -jpeg_fdct_ifast (DCTELEM * data) +fdct_ifast (DCTELEM * data) { DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; DCTELEM tmp10, tmp11, tmp12, tmp13; @@ -222,3 +222,10 @@ jpeg_fdct_ifast (DCTELEM * data) dataptr++; /* advance pointer to next column */ } } + + +#undef GLOBAL +#undef CONST_BITS +#undef DESCALE +#undef FIX_0_541196100 +#undef FIX_1_306562965 diff --git a/src/libffmpeg/libavcodec/jrevdct.c b/src/libffmpeg/libavcodec/jrevdct.c index 246f1b190..3ba91543d 100644 --- a/src/libffmpeg/libavcodec/jrevdct.c +++ b/src/libffmpeg/libavcodec/jrevdct.c @@ -1166,4 +1166,5 @@ void j_rev_dct(DCTBLOCK data) } } - +#undef FIX +#undef CONST_BITS diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c index 1eb35d2b8..577e9d884 100644 --- a/src/libffmpeg/libavcodec/mjpeg.c +++ b/src/libffmpeg/libavcodec/mjpeg.c @@ -1,28 +1,36 @@ /* * MJPEG encoder and decoder - * Copyright (c) 2000, 2001 Gerard Lantau. + * Copyright (c) 2000, 2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
* - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Support for external huffman table and various fixes (AVID workaround) by + * Alex Beregszaszi <alex@naxine.org> */ //#define DEBUG -#include "config.h" - #include "avcodec.h" #include "dsputil.h" #include "mpegvideo.h" -#include "xine-utils/xineutils.h" + +#ifdef USE_FASTMEMCPY +#include "fastmemcpy.h" +#endif + +/* use two quantizer table (one for luminance and one for chrominance) */ +/* not yet working */ +#undef TWOMATRIXES typedef struct MJpegContext { UINT8 huff_size_dc_luminance[12]; @@ -36,12 +44,87 @@ typedef struct MJpegContext { UINT16 huff_code_ac_chrominance[256]; } MJpegContext; -#define SOF0 0xc0 -#define SOI 0xd8 -#define EOI 0xd9 -#define DQT 0xdb -#define DHT 0xc4 -#define SOS 0xda +/* JPEG marker codes */ +typedef enum { + /* start of frame */ + SOF0 = 0xc0, /* baseline */ + SOF1 = 0xc1, /* extended sequential, huffman */ + SOF2 = 0xc2, /* progressive, huffman */ + SOF3 = 0xc3, /* lossless, huffman */ + + SOF5 = 0xc5, /* differential sequential, huffman */ + SOF6 = 0xc6, /* differential progressive, huffman */ + SOF7 = 0xc7, /* differential lossless, huffman */ + JPG = 0xc8, /* reserved for JPEG 
extension */ + SOF9 = 0xc9, /* extended sequential, arithmetic */ + SOF10 = 0xca, /* progressive, arithmetic */ + SOF11 = 0xcb, /* lossless, arithmetic */ + + SOF13 = 0xcd, /* differential sequential, arithmetic */ + SOF14 = 0xce, /* differential progressive, arithmetic */ + SOF15 = 0xcf, /* differential lossless, arithmetic */ + + DHT = 0xc4, /* define huffman tables */ + + DAC = 0xcc, /* define arithmetic-coding conditioning */ + + /* restart with modulo 8 count "m" */ + RST0 = 0xd0, + RST1 = 0xd1, + RST2 = 0xd2, + RST3 = 0xd3, + RST4 = 0xd4, + RST5 = 0xd5, + RST6 = 0xd6, + RST7 = 0xd7, + + SOI = 0xd8, /* start of image */ + EOI = 0xd9, /* end of image */ + SOS = 0xda, /* start of scan */ + DQT = 0xdb, /* define quantization tables */ + DNL = 0xdc, /* define number of lines */ + DRI = 0xdd, /* define restart interval */ + DHP = 0xde, /* define hierarchical progression */ + EXP = 0xdf, /* expand reference components */ + + APP0 = 0xe0, + APP1 = 0xe1, + APP2 = 0xe2, + APP3 = 0xe3, + APP4 = 0xe4, + APP5 = 0xe5, + APP6 = 0xe6, + APP7 = 0xe7, + APP8 = 0xe8, + APP9 = 0xe9, + APP10 = 0xea, + APP11 = 0xeb, + APP12 = 0xec, + APP13 = 0xed, + APP14 = 0xee, + APP15 = 0xef, + + JPG0 = 0xf0, + JPG1 = 0xf1, + JPG2 = 0xf2, + JPG3 = 0xf3, + JPG4 = 0xf4, + JPG5 = 0xf5, + JPG6 = 0xf6, + JPG7 = 0xf7, + JPG8 = 0xf8, + JPG9 = 0xf9, + JPG10 = 0xfa, + JPG11 = 0xfb, + JPG12 = 0xfc, + JPG13 = 0xfd, + + COM = 0xfe, /* comment */ + + TEM = 0x01, /* temporary private use for arithmetic coding */ + + /* 0x02 -> 0xbf reserved */ +} JPEG_MARKER; #if 0 /* These are the sample quantization tables given in JPEG spec section K.1. @@ -135,7 +218,6 @@ static const UINT8 val_ac_chrominance[] = 0xf9, 0xfa }; - /* isn't this function nicer than the one in the libjpeg ? 
*/ static void build_huffman_codes(UINT8 *huff_size, UINT16 *huff_code, const UINT8 *bits_table, const UINT8 *val_table) @@ -160,9 +242,13 @@ int mjpeg_init(MpegEncContext *s) { MJpegContext *m; - m = malloc(sizeof(MJpegContext)); + m = av_malloc(sizeof(MJpegContext)); if (!m) return -1; + + s->min_qcoeff=-1023; + s->max_qcoeff= 1023; + s->intra_quant_bias= 1<<(QUANT_BIAS_SHIFT-1); //(a + x/2)/x /* build all the huffman tables */ build_huffman_codes(m->huff_size_dc_luminance, @@ -188,7 +274,7 @@ int mjpeg_init(MpegEncContext *s) void mjpeg_close(MpegEncContext *s) { - free(s->mjpeg_ctx); + av_free(s->mjpeg_ctx); } static inline void put_marker(PutBitContext *p, int code) @@ -227,14 +313,18 @@ static void jpeg_table_header(MpegEncContext *s) /* quant matrixes */ put_marker(p, DQT); +#ifdef TWOMATRIXES + put_bits(p, 16, 2 + 2 * (1 + 64)); +#else put_bits(p, 16, 2 + 1 * (1 + 64)); +#endif put_bits(p, 4, 0); /* 8 bit precision */ put_bits(p, 4, 0); /* table 0 */ for(i=0;i<64;i++) { j = zigzag_direct[i]; put_bits(p, 8, s->intra_matrix[j]); } -#if 0 +#ifdef TWOMATRIXES put_bits(p, 4, 0); /* 8 bit precision */ put_bits(p, 4, 1); /* table 1 */ for(i=0;i<64;i++) { @@ -258,10 +348,46 @@ static void jpeg_table_header(MpegEncContext *s) ptr[1] = size; } +static void jpeg_put_comments(MpegEncContext *s) +{ + PutBitContext *p = &s->pb; + int size; + UINT8 *ptr; + +#if 0 + /* JFIF header */ + put_marker(p, APP0); + put_bits(p, 16, 16); + put_string(p, "JFIF"); /* this puts the trailing zero-byte too */ + put_bits(p, 16, 0x101); + put_bits(p, 8, 0); /* units type: 0 - aspect ratio */ + put_bits(p, 16, 1); /* aspect: 1:1 */ + put_bits(p, 16, 1); + put_bits(p, 8, 0); /* thumbnail width */ + put_bits(p, 8, 0); /* thumbnail height */ +#endif + + /* comment */ + put_marker(p, COM); + flush_put_bits(p); + ptr = pbBufPtr(p); + put_bits(p, 16, 0); /* patched later */ +#define VERSION "FFmpeg" LIBAVCODEC_VERSION "b" LIBAVCODEC_BUILD_STR + put_string(p, VERSION); + size = strlen(VERSION)+3; 
+#undef VERSION + ptr[0] = size >> 8; + ptr[1] = size; +} + void mjpeg_picture_header(MpegEncContext *s) { put_marker(&s->pb, SOI); + if (!s->mjpeg_data_only_frames) + { + jpeg_put_comments(s); + if (s->mjpeg_write_tables) jpeg_table_header(s); put_marker(&s->pb, SOF0); @@ -282,13 +408,22 @@ void mjpeg_picture_header(MpegEncContext *s) put_bits(&s->pb, 8, 2); /* component number */ put_bits(&s->pb, 4, s->mjpeg_hsample[1]); /* H factor */ put_bits(&s->pb, 4, s->mjpeg_vsample[1]); /* V factor */ +#ifdef TWOMATRIXES + put_bits(&s->pb, 8, 1); /* select matrix */ +#else put_bits(&s->pb, 8, 0); /* select matrix */ +#endif /* Cr component */ put_bits(&s->pb, 8, 3); /* component number */ put_bits(&s->pb, 4, s->mjpeg_hsample[2]); /* H factor */ put_bits(&s->pb, 4, s->mjpeg_vsample[2]); /* V factor */ +#ifdef TWOMATRIXES + put_bits(&s->pb, 8, 1); /* select matrix */ +#else put_bits(&s->pb, 8, 0); /* select matrix */ +#endif + } /* scan header */ put_marker(&s->pb, SOS); @@ -312,7 +447,7 @@ void mjpeg_picture_header(MpegEncContext *s) put_bits(&s->pb, 8, 0); /* Ss (not used) */ put_bits(&s->pb, 8, 63); /* Se (not used) */ - put_bits(&s->pb, 8, 0); /* (not used) */ + put_bits(&s->pb, 8, 0); /* Ah/Al (not used) */ } void mjpeg_picture_trailer(MpegEncContext *s) @@ -321,8 +456,8 @@ void mjpeg_picture_trailer(MpegEncContext *s) put_marker(&s->pb, EOI); } -static inline void encode_dc(MpegEncContext *s, int val, - UINT8 *huff_size, UINT16 *huff_code) +static inline void mjpeg_encode_dc(MpegEncContext *s, int val, + UINT8 *huff_size, UINT16 *huff_code) { int mant, nbits; @@ -361,11 +496,11 @@ static void encode_block(MpegEncContext *s, DCTELEM *block, int n) dc = block[0]; /* overflow is impossible */ val = dc - s->last_dc[component]; if (n < 4) { - encode_dc(s, val, m->huff_size_dc_luminance, m->huff_code_dc_luminance); + mjpeg_encode_dc(s, val, m->huff_size_dc_luminance, m->huff_code_dc_luminance); huff_size_ac = m->huff_size_ac_luminance; huff_code_ac = 
m->huff_code_ac_luminance; } else { - encode_dc(s, val, m->huff_size_dc_chrominance, m->huff_code_dc_chrominance); + mjpeg_encode_dc(s, val, m->huff_size_dc_chrominance, m->huff_code_dc_chrominance); huff_size_ac = m->huff_size_ac_chrominance; huff_code_ac = m->huff_code_ac_chrominance; } @@ -429,6 +564,7 @@ void mjpeg_encode_mb(MpegEncContext *s, #define MAX_COMPONENTS 4 typedef struct MJpegDecodeContext { + AVCodecContext *avctx; GetBitContext gb; UINT32 header_state; int start_code; /* current start code */ @@ -455,8 +591,21 @@ typedef struct MJpegDecodeContext { int linesize[MAX_COMPONENTS]; DCTELEM block[64] __align8; UINT8 buffer[PICTURE_BUFFER_SIZE]; + + int buggy_avid; + int restart_interval; + int restart_count; + int interleaved_rows; } MJpegDecodeContext; +#define SKIP_REMAINING(gb, len) { \ + dprintf("reamining %d bytes in marker\n", len); \ + if (len) while (--len) \ + skip_bits(gb, 8); \ +} + +static int mjpeg_decode_dht(MJpegDecodeContext *s, UINT8 *buf, int buf_size); + static void build_vlc(VLC *vlc, const UINT8 *bits_table, const UINT8 *val_table, int nb_codes) { @@ -473,6 +622,8 @@ static int mjpeg_decode_init(AVCodecContext *avctx) { MJpegDecodeContext *s = avctx->priv_data; + s->avctx = avctx; + s->header_state = 0; s->mpeg_enc_ctx_allocated = 0; s->buffer_size = PICTURE_BUFFER_SIZE - 1; /* minus 1 to take into @@ -487,6 +638,13 @@ static int mjpeg_decode_init(AVCodecContext *avctx) build_vlc(&s->vlcs[0][1], bits_dc_chrominance, val_dc_chrominance, 12); build_vlc(&s->vlcs[1][0], bits_ac_luminance, val_ac_luminance, 251); build_vlc(&s->vlcs[1][1], bits_ac_chrominance, val_ac_chrominance, 251); + + if (avctx->flags & CODEC_FLAG_EXTERN_HUFF) + { + printf("mjpeg: using external huffman table\n"); + mjpeg_decode_dht(s, avctx->extradata, avctx->extradata_size); + /* should check for error - but dunno */ + } return 0; } @@ -496,14 +654,16 @@ static int mjpeg_decode_dqt(MJpegDecodeContext *s, { int len, index, i, j; init_get_bits(&s->gb, buf, 
buf_size); - - len = get_bits(&s->gb, 16); - len -= 2; + + len = get_bits(&s->gb, 16) - 2; while (len >= 65) { /* only 8 bit precision handled */ if (get_bits(&s->gb, 4) != 0) + { + dprintf("dqt: 16bit precision\n"); return -1; + } index = get_bits(&s->gb, 4); if (index >= 4) return -1; @@ -511,10 +671,13 @@ static int mjpeg_decode_dqt(MJpegDecodeContext *s, /* read quant table */ for(i=0;i<64;i++) { j = zigzag_direct[i]; - s->quant_matrixes[index][j] = get_bits(&s->gb, 8); + s->quant_matrixes[index][j] = get_bits(&s->gb, 8); } len -= 65; } + + SKIP_REMAINING(&s->gb, len); + return 0; } @@ -581,6 +744,7 @@ static int mjpeg_decode_sof0(MJpegDecodeContext *s, return -1; height = get_bits(&s->gb, 16); width = get_bits(&s->gb, 16); + dprintf("sof0: picture: %dx%d\n", width, height); nb_components = get_bits(&s->gb, 8); if (nb_components <= 0 || @@ -602,16 +766,15 @@ static int mjpeg_decode_sof0(MJpegDecodeContext *s, s->quant_index[i] = get_bits(&s->gb, 8); if (s->quant_index[i] >= 4) return -1; - dprintf("component %d %d:%d\n", i, s->h_count[i], s->v_count[i]); + dprintf("component %d %d:%d id: %d quant:%d\n", i, s->h_count[i], + s->v_count[i], s->component_id[i], s->quant_index[i]); } /* if different size, realloc/alloc picture */ /* XXX: also check h_count and v_count */ if (width != s->width || height != s->height) { - for(i=0;i<MAX_COMPONENTS;i++) { - free(s->current_picture[i]); - s->current_picture[i] = NULL; - } + for(i=0;i<MAX_COMPONENTS;i++) + av_freep(&s->current_picture[i]); s->width = width; s->height = height; /* test interlaced mode */ @@ -619,7 +782,7 @@ static int mjpeg_decode_sof0(MJpegDecodeContext *s, s->org_height != 0 && s->height < ((s->org_height * 3) / 4)) { s->interlaced = 1; - s->bottom_field = 0; + s->bottom_field = 0; } for(i=0;i<nb_components;i++) { @@ -636,19 +799,26 @@ static int mjpeg_decode_sof0(MJpegDecodeContext *s, } s->first_picture = 0; } + + if (len != (8+(3*nb_components))) + { + dprintf("decode_sof0: error, len(%d) mismatch\n", 
len); + } return 0; } -static inline int decode_dc(MJpegDecodeContext *s, int dc_index) +static inline int mjpeg_decode_dc(MJpegDecodeContext *s, int dc_index) { - VLC *dc_vlc; int code, diff; - dc_vlc = &s->vlcs[0][dc_index]; - code = get_vlc(&s->gb, dc_vlc); + code = get_vlc(&s->gb, &s->vlcs[0][dc_index]); if (code < 0) + { + dprintf("mjpeg_decode_dc: bad vlc: %d:%d (%p)\n", 0, dc_index, + &s->vlcs[0][dc_index]); return 0xffff; + } if (code == 0) { diff = 0; } else { @@ -668,13 +838,13 @@ static int decode_block(MJpegDecodeContext *s, DCTELEM *block, VLC *ac_vlc; INT16 *quant_matrix; - quant_matrix = s->quant_matrixes[quant_index]; /* DC coef */ - val = decode_dc(s, dc_index); + val = mjpeg_decode_dc(s, dc_index); if (val == 0xffff) { dprintf("error dc\n"); return -1; } + quant_matrix = s->quant_matrixes[quant_index]; val = val * quant_matrix[0] + s->last_dc[component]; s->last_dc[component] = val; block[0] = val; @@ -731,17 +901,24 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s, nb_components = get_bits(&s->gb, 8); /* XXX: only interleaved scan accepted */ if (nb_components != 3) + { + dprintf("decode_sos: components(%d) mismatch\n", nb_components); return -1; + } vmax = 0; hmax = 0; for(i=0;i<nb_components;i++) { id = get_bits(&s->gb, 8) - 1; + dprintf("component: %d\n", id); /* find component index */ for(index=0;index<s->nb_components;index++) if (id == s->component_id[index]) break; if (index == s->nb_components) + { + dprintf("decode_sos: index(%d) out of components\n", index); return -1; + } comp_index[i] = index; nb_blocks[i] = s->h_count[index] * s->v_count[index]; @@ -749,15 +926,31 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s, v_count[i] = s->v_count[index]; dc_index[i] = get_bits(&s->gb, 4); - if (dc_index[i] >= 4) - return -1; ac_index[i] = get_bits(&s->gb, 4); - if (ac_index[i] >= 4) - return -1; + + if (dc_index[i] < 0 || ac_index[i] < 0 || + dc_index[i] >= 4 || ac_index[i] >= 4) + goto out_of_range; + switch(s->start_code) + { + case 
SOF0: + if (dc_index[i] > 1 || ac_index[i] > 1) + goto out_of_range; + break; + case SOF1: + case SOF2: + if (dc_index[i] > 3 || ac_index[i] > 3) + goto out_of_range; + break; + case SOF3: + if (dc_index[i] > 3 || ac_index[i] != 0) + goto out_of_range; + break; + } } - get_bits(&s->gb, 8); /* Ss */ - get_bits(&s->gb, 8); /* Se */ - get_bits(&s->gb, 8); /* not used */ + skip_bits(&s->gb, 8); /* Ss */ + skip_bits(&s->gb, 8); /* Se */ + skip_bits(&s->gb, 8); /* Ah and Al (each are 4 bits) */ for(i=0;i<nb_components;i++) s->last_dc[i] = 1024; @@ -787,22 +980,24 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s, v = v_count[i]; x = 0; y = 0; + if (s->restart_interval && !s->restart_count) + s->restart_count = s->restart_interval; for(j=0;j<n;j++) { memset(s->block, 0, sizeof(s->block)); if (decode_block(s, s->block, i, dc_index[i], ac_index[i], s->quant_index[c]) < 0) { - dprintf("error %d %d\n", mb_y, mb_x); + dprintf("error y=%d x=%d\n", mb_y, mb_x); ret = -1; goto the_end; } - ff_idct (s->block); +// dprintf("mb: %d %d processed\n", mb_y, mb_x); ptr = s->current_picture[c] + (s->linesize[c] * (v * mb_y + y) * 8) + (h * mb_x + x) * 8; if (s->interlaced && s->bottom_field) ptr += s->linesize[c] >> 1; - put_pixels_clamped(s->block, ptr, s->linesize[c]); + ff_idct_put(ptr, s->linesize[c], s->block); if (++x == h) { x = 0; y++; @@ -815,6 +1010,139 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s, the_end: emms_c(); return ret; + out_of_range: + dprintf("decode_sos: ac/dc index out of range\n"); + return -1; +} + +static int mjpeg_decode_dri(MJpegDecodeContext *s, + UINT8 *buf, int buf_size) +{ + init_get_bits(&s->gb, buf, buf_size); + + if (get_bits(&s->gb, 16) != 4) + return -1; + s->restart_interval = get_bits(&s->gb, 16); + printf("restart interval: %d\n", s->restart_interval); + + return 0; +} + +#define FOURCC(a,b,c,d) ((a << 24) | (b << 16) | (c << 8) | d) +static int mjpeg_decode_app(MJpegDecodeContext *s, + UINT8 *buf, int buf_size, int start_code) +{ + int 
len, id; + + init_get_bits(&s->gb, buf, buf_size); + + /* XXX: verify len field validity */ + len = get_bits(&s->gb, 16); + if (len < 5) + return -1; + + id = (get_bits(&s->gb, 16) << 16) | get_bits(&s->gb, 16); + len -= 6; + + /* buggy AVID, it puts EOI only at every 10th frame */ + /* also this fourcc is used by non-avid files too, it means + interleaving, but it's always present in AVID files */ + if (id == FOURCC('A','V','I','1')) + { + /* structure: + 4bytes AVI1 + 1bytes polarity + 1bytes always zero + 4bytes field_size + 4bytes field_size_less_padding + */ + s->buggy_avid = 1; + if (s->first_picture) + printf("mjpeg: workarounding buggy AVID\n"); + s->interleaved_rows = get_bits(&s->gb, 8); +#if 0 + skip_bits(&s->gb, 8); + skip_bits(&s->gb, 32); + skip_bits(&s->gb, 32); + len -= 10; +#endif + if (s->interleaved_rows) + printf("mjpeg: interleaved rows: %d\n", s->interleaved_rows); + goto out; + } + + len -= 2; + + if (id == FOURCC('J','F','I','F')) + { + skip_bits(&s->gb, 8); /* the trailing zero-byte */ + printf("mjpeg: JFIF header found (version: %x.%x)\n", + get_bits(&s->gb, 8), get_bits(&s->gb, 8)); + goto out; + } + + /* Apple MJPEG-A */ + if ((start_code == APP1) && (len > (0x28 - 8))) + { + id = (get_bits(&s->gb, 16) << 16) | get_bits(&s->gb, 16); + len -= 4; + if (id == FOURCC('m','j','p','g')) /* Apple MJPEG-A */ + { +#if 0 + skip_bits(&s->gb, 32); /* field size */ + skip_bits(&s->gb, 32); /* pad field size */ + skip_bits(&s->gb, 32); /* next off */ + skip_bits(&s->gb, 32); /* quant off */ + skip_bits(&s->gb, 32); /* huff off */ + skip_bits(&s->gb, 32); /* image off */ + skip_bits(&s->gb, 32); /* scan off */ + skip_bits(&s->gb, 32); /* data off */ +#endif + if (s->first_picture) + printf("mjpeg: Apple MJPEG-A header found\n"); + } + } + +out: + /* should check for further values.. 
*/ + SKIP_REMAINING(&s->gb, len); + + return 0; +} +#undef FOURCC + +static int mjpeg_decode_com(MJpegDecodeContext *s, + UINT8 *buf, int buf_size) +{ + int len, i; + UINT8 *cbuf; + + init_get_bits(&s->gb, buf, buf_size); + + /* XXX: verify len field validity */ + len = get_bits(&s->gb, 16)-2; + cbuf = av_malloc(len+1); + + for (i = 0; i < len; i++) + cbuf[i] = get_bits(&s->gb, 8); + if (cbuf[i-1] == '\n') + cbuf[i-1] = 0; + else + cbuf[i] = 0; + + printf("mjpeg comment: '%s'\n", cbuf); + + /* buggy avid, it puts EOI only at every 10th frame */ + if (!strcmp(cbuf, "AVID")) + { + s->buggy_avid = 1; + if (s->first_picture) + printf("mjpeg: workarounding buggy AVID\n"); + } + + av_free(cbuf); + + return 0; } /* return the 8 bit start code value and update the search @@ -858,8 +1186,9 @@ static int mjpeg_decode_frame(AVCodecContext *avctx, { MJpegDecodeContext *s = avctx->priv_data; UINT8 *buf_end, *buf_ptr, *buf_start; - int len, code, start_code, input_size, i; + int len, code, input_size, i; AVPicture *picture = data; + unsigned int start_code; *data_size = 0; @@ -883,10 +1212,13 @@ static int mjpeg_decode_frame(AVCodecContext *avctx, } else { memcpy(s->buf_ptr, buf_start, len); s->buf_ptr += len; - /* if we got FF 00, we copy FF to the stream to unescape FF 00 */ - if (code == 0) { + if (code < 0) { + /* nothing to do: wait next marker */ + } else if (code == 0 || code == 0xff) { + /* if we got FF 00, we copy FF to the stream to unescape FF 00 */ + /* valid marker code is between 00 and ff - alex */ s->buf_ptr--; - } else if (code > 0) { + } else { /* prepare data for next start code */ input_size = s->buf_ptr - s->buffer; start_code = s->start_code; @@ -895,6 +1227,7 @@ static int mjpeg_decode_frame(AVCodecContext *avctx, dprintf("marker=%x\n", start_code); switch(start_code) { case SOI: + s->restart_interval = 0; /* nothing to do on SOI */ break; case DQT: @@ -908,7 +1241,7 @@ static int mjpeg_decode_frame(AVCodecContext *avctx, break; case SOS: 
mjpeg_decode_sos(s, s->buffer, input_size); - if (s->start_code == EOI) { + if (s->start_code == EOI || s->buggy_avid || s->restart_interval) { int l; if (s->interlaced) { s->bottom_field ^= 1; @@ -943,11 +1276,41 @@ static int mjpeg_decode_frame(AVCodecContext *avctx, } /* dummy quality */ /* XXX: infer it with matrix */ - avctx->quality = 3; + avctx->quality = 3; goto the_end; } break; + case DRI: + mjpeg_decode_dri(s, s->buffer, input_size); + break; + case SOF1: + case SOF2: + case SOF3: + case SOF5: + case SOF6: + case SOF7: + case SOF9: + case SOF10: + case SOF11: + case SOF13: + case SOF14: + case SOF15: + case JPG: + printf("mjpeg: unsupported coding type (%x)\n", start_code); + return -1; } +#if 1 + if (start_code >= 0xd0 && start_code <= 0xd7) { + dprintf("restart marker: %d\n", start_code&0x0f); + } else if (s->first_picture) { + /* APP fields */ + if (start_code >= 0xe0 && start_code <= 0xef) + mjpeg_decode_app(s, s->buffer, input_size, start_code); + /* Comment */ + else if (start_code == COM) + mjpeg_decode_com(s, s->buffer, input_size); + } +#endif } } } @@ -961,7 +1324,7 @@ static int mjpeg_decode_end(AVCodecContext *avctx) int i, j; for(i=0;i<MAX_COMPONENTS;i++) - free(s->current_picture[i]); + av_free(s->current_picture[i]); for(i=0;i<2;i++) { for(j=0;j<4;j++) free_vlc(&s->vlcs[i][j]); @@ -978,4 +1341,6 @@ AVCodec mjpeg_decoder = { NULL, mjpeg_decode_end, mjpeg_decode_frame, + 0, + NULL }; diff --git a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c index 236c9206a..4539b6464 100644 --- a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c +++ b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c @@ -1,20 +1,20 @@ /* * Sun mediaLib optimized DSP utils - * Copyright (c) 2001 Gerard Lantau. + * Copyright (c) 2001 Fabrice Bellard. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "../dsputil.h" diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c index 92724ac87..8f2ffa42e 100644 --- a/src/libffmpeg/libavcodec/motion_est.c +++ b/src/libffmpeg/libavcodec/motion_est.c @@ -1,43 +1,45 @@ /* * Motion estimation - * Copyright (c) 2000,2001 Gerard Lantau. + * Copyright (c) 2000,2001 Fabrice Bellard. 
+ * Copyright (c) 2002 Michael Niedermayer * * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * new Motion Estimation (X1/EPZS) by Michael Niedermayer <michaelni@gmx.at> */ -#include "config.h" -#include "xine-utils/xineutils.h" #include <stdlib.h> #include <stdio.h> #include "avcodec.h" #include "dsputil.h" #include "mpegvideo.h" -#define ABS(a) ((a)>0 ? (a) : -(a)) -#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) +#define SQ(a) ((a)*(a)) #define INTER_BIAS 257 -static void halfpel_motion_search(MpegEncContext * s, - int *mx_ptr, int *my_ptr, int dmin, - int xmin, int ymin, int xmax, int ymax, - int pred_x, int pred_y); +#define P_LAST P[0] +#define P_LEFT P[1] +#define P_TOP P[2] +#define P_TOPRIGHT P[3] +#define P_MEDIAN P[4] +#define P_LAST_LEFT P[5] +#define P_LAST_RIGHT P[6] +#define P_LAST_TOP P[7] +#define P_LAST_BOTTOM P[8] +#define P_MV1 P[9] -/* config it to test motion vector encoding (send random vectors) */ -//#define CONFIG_TEST_MV_ENCODE static int pix_sum(UINT8 * pix, int line_size) { @@ -140,7 +142,7 @@ static void no_motion_search(MpegEncContext * s, static int full_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int range, - int xmin, int ymin, int xmax, int ymax) + int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture) { int x1, y1, x2, y2, xx, yy, x, y; int mx, my, dmin, d; @@ -166,7 +168,7 @@ static int full_motion_search(MpegEncContext * s, my = 0; for (y = y1; y <= y2; y++) { for (x = x1; x <= x2; x++) { - d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, + d = pix_abs16x16(pix, ref_picture + (y * s->linesize) + x, s->linesize); if (d < dmin || (d == dmin && @@ -194,7 +196,7 @@ static int full_motion_search(MpegEncContext * s, static int log_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int range, - int xmin, int ymin, int xmax, int ymax) + int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture) { int x1, y1, x2, y2, xx, yy, x, y; int mx, my, dmin, d; @@ -231,7 +233,7 @@ static int log_motion_search(MpegEncContext * s, do { for (y = y1; y <= y2; y += range) { for (x = x1; x <= x2; x += range) { - d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize); + d = pix_abs16x16(pix, ref_picture + (y * s->linesize) + x, s->linesize); if (d < dmin || (d == dmin && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { dmin = d; mx = x; @@ -270,7 +272,7 @@ static 
int log_motion_search(MpegEncContext * s, static int phods_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int range, - int xmin, int ymin, int xmax, int ymax) + int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture) { int x1, y1, x2, y2, xx, yy, x, y, lastx, d; int mx, my, dminx, dminy; @@ -311,7 +313,7 @@ static int phods_motion_search(MpegEncContext * s, lastx = x; for (x = x1; x <= x2; x += range) { - d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize); + d = pix_abs16x16(pix, ref_picture + (y * s->linesize) + x, s->linesize); if (d < dminx || (d == dminx && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { dminx = d; mx = x; @@ -320,7 +322,7 @@ static int phods_motion_search(MpegEncContext * s, x = lastx; for (y = y1; y <= y2; y += range) { - d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize); + d = pix_abs16x16(pix, ref_picture + (y * s->linesize) + x, s->linesize); if (d < dminy || (d == dminy && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { dminy = d; my = y; @@ -364,62 +366,61 @@ static int phods_motion_search(MpegEncContext * s, #define CHECK_MV(x,y)\ {\ - d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ - d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ - if(d<dmin){\ - best[0]=x;\ - best[1]=y;\ - dmin=d;\ + const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\ + const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\ + if(map[index]!=key){\ + d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ + d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ + COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\ + map[index]= key;\ + score_map[index]= d;\ }\ } #define CHECK_MV_DIR(x,y,new_dir)\ {\ - d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ - d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ 
- if(d<dmin){\ - best[0]=x;\ - best[1]=y;\ - dmin=d;\ - next_dir= new_dir;\ + const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\ + const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\ + if(map[index]!=key){\ + d = pix_abs(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ + d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ + if(d<dmin){\ + best[0]=x;\ + best[1]=y;\ + dmin=d;\ + next_dir= new_dir;\ + }\ + map[index]= key;\ + score_map[index]= d;\ }\ } #define CHECK_MV4(x,y)\ {\ - d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ - d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ - if(d<dmin){\ - best[0]=x;\ - best[1]=y;\ - dmin=d;\ + const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\ + const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\ + if(map[index]!=key){\ + d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ + d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ + COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\ + map[index]= key;\ + score_map[index]= d;\ }\ } -#define CHECK_MV4_DIR(x,y,new_dir)\ -{\ - d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ - d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ - if(d<dmin){\ - best[0]=x;\ - best[1]=y;\ - dmin=d;\ - next_dir= new_dir;\ - }\ -} - - #define check(x,y,S,v)\ -if( (x)<(xmin<<(S)) ) printf("%d %d %d %d xmin" #v, (x), (y), s->mb_x, s->mb_y);\ -if( (x)>(xmax<<(S)) ) printf("%d %d %d %d xmax" #v, (x), (y), s->mb_x, s->mb_y);\ -if( (y)<(ymin<<(S)) ) printf("%d %d %d %d ymin" #v, (x), (y), s->mb_x, s->mb_y);\ -if( (y)>(ymax<<(S)) ) printf("%d %d %d %d ymax" #v, (x), (y), s->mb_x, s->mb_y);\ +if( (x)<(xmin<<(S)) ) printf("%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\ +if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\ +if( (y)<(ymin<<(S)) ) 
printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\ +if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\ static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin, UINT8 *new_pic, UINT8 *old_pic, int pic_stride, int pred_x, int pred_y, UINT16 *mv_penalty, int quant, - int xmin, int ymin, int xmax, int ymax, int shift) + int xmin, int ymin, int xmax, int ymax, int shift, + uint32_t *map, uint16_t *score_map, int map_generation, + op_pixels_abs_func pix_abs) { int next_dir=-1; @@ -467,36 +468,19 @@ static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin, */ } -static inline int small_diamond_search4MV(MpegEncContext * s, int *best, int dmin, - UINT8 *new_pic, UINT8 *old_pic, int pic_stride, - int pred_x, int pred_y, UINT16 *mv_penalty, int quant, - int xmin, int ymin, int xmax, int ymax, int shift) -{ - int next_dir=-1; - - for(;;){ - int d; - const int dir= next_dir; - const int x= best[0]; - const int y= best[1]; - next_dir=-1; - -//printf("%d", dir); - if(dir!=2 && x>xmin) CHECK_MV4_DIR(x-1, y , 0) - if(dir!=3 && y>ymin) CHECK_MV4_DIR(x , y-1, 1) - if(dir!=0 && x<xmax) CHECK_MV4_DIR(x+1, y , 2) - if(dir!=1 && y<ymax) CHECK_MV4_DIR(x , y+1, 3) - - if(next_dir==-1){ - return dmin; - } - } -} - +#if 1 +#define SNAKE_1 3 +#define SNAKE_2 2 +#else +#define SNAKE_1 7 +#define SNAKE_2 3 +#endif static inline int snake_search(MpegEncContext * s, int *best, int dmin, UINT8 *new_pic, UINT8 *old_pic, int pic_stride, int pred_x, int pred_y, UINT16 *mv_penalty, int quant, - int xmin, int ymin, int xmax, int ymax, int shift) + int xmin, int ymin, int xmax, int ymax, int shift, + uint32_t *map, uint16_t *score_map,int map_generation, + op_pixels_abs_func pix_abs) { int dir=0; int c=1; @@ -522,8 +506,15 @@ if(256*256*256*64%point==0) x+=x_dir[dir]; y+=y_dir[dir]; if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){ - d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride); - 
d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant; + const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation; + const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1); + if(map[index]!=key){ + d = pix_abs(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride); + d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant; + map[index]=key; + score_map[index]=d; + }else + d= dmin+1; }else{ d = dmin + 10000; //FIXME smarter boundary handling } @@ -542,21 +533,90 @@ if(256*256*256*64%point==0) }else{ //bad++; if(fails){ - if(fails>=3) return dmin; + if(fails>=SNAKE_1+1) return dmin; }else{ - c= -c; + if(dir&1) dir-= c*3; + else c= -c; +// c= -c; } - dir+=c*2; + dir+=c*SNAKE_2; fails++; } dir&=7; } } +static inline int cross_search(MpegEncContext * s, int *best, int dmin, + UINT8 *new_pic, UINT8 *old_pic, int pic_stride, + int pred_x, int pred_y, UINT16 *mv_penalty, int quant, + int xmin, int ymin, int xmax, int ymax, int shift, + uint32_t *map, uint16_t *score_map,int map_generation, + op_pixels_abs_func pix_abs) +{ + static int x_dir[4]= {-1, 0, 1, 0}; + static int y_dir[4]= { 0,-1, 0, 1}; + int improvement[2]={100000, 100000}; + int dirs[2]={2, 3}; + int dir; + int last_dir= -1; + + for(;;){ + dir= dirs[ improvement[0] > improvement[1] ? 
0 : 1 ]; + if(improvement[dir&1]==-1) return dmin; + + { + const int x= best[0] + x_dir[dir]; + const int y= best[1] + y_dir[dir]; + const int key= (y<<ME_MAP_MV_BITS) + x + map_generation; + const int index= ((y<<ME_MAP_SHIFT) + x)&(ME_MAP_SIZE-1); + int d; + if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){ + if(map[index]!=key){ + d = pix_abs(new_pic, old_pic + x + y*pic_stride, pic_stride); + d += (mv_penalty[(x<<shift)-pred_x] + mv_penalty[(y<<shift)-pred_y])*quant; + map[index]=key; + score_map[index]=d; + if(d<dmin){ + improvement[dir&1]= dmin-d; + improvement[(dir&1)^1]++; + dmin=d; + best[0]= x; + best[1]= y; + last_dir=dir; + continue; + } + }else{ + d= score_map[index]; + } + }else{ + d= dmin + 1000; //FIXME is this a good idea? + } + /* evaluated point was cached or checked and worse */ + + if(last_dir==dir){ + improvement[dir&1]= -1; + }else{ + improvement[dir&1]= d-dmin; + last_dir= dirs[dir&1]= dir^2; + } + } + } +} + +static inline int update_map_generation(MpegEncContext * s) +{ + s->me_map_generation+= 1<<(ME_MAP_MV_BITS*2); + if(s->me_map_generation==0){ + s->me_map_generation= 1<<(ME_MAP_MV_BITS*2); + memset(s->me_map, 0, sizeof(uint32_t)*ME_MAP_SIZE); + } + return s->me_map_generation; +} + static int epzs_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, - int P[5][2], int pred_x, int pred_y, - int xmin, int ymin, int xmax, int ymax) + int P[10][2], int pred_x, int pred_y, + int xmin, int ymin, int xmax, int ymax, uint8_t * ref_picture) { int best[2]={0, 0}; int d, dmin; @@ -566,42 +626,74 @@ static int epzs_motion_search(MpegEncContext * s, UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame int quant= s->qscale; // qscale of the prev frame const int shift= 1+s->quarter_sample; + uint32_t *map= s->me_map; + uint16_t *score_map= s->me_score_map; + int map_generation; new_pic = s->new_picture[0] + pic_xy; - old_pic = s->last_picture[0] + pic_xy; - + old_pic = ref_picture + pic_xy; + + map_generation= 
update_map_generation(s); + dmin = pix_abs16x16(new_pic, old_pic, pic_stride); - if(dmin<Z_THRESHOLD){ - *mx_ptr= 0; - *my_ptr= 0; -//printf("Z"); - return dmin; - } + map[0]= map_generation; + score_map[0]= dmin; /* first line */ - if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line)) { - CHECK_MV(P[1][0]>>shift, P[1][1]>>shift) + if ((s->mb_y == 0 || s->first_slice_line)) { + CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift) + CHECK_MV(P_LAST[0]>>shift, P_LAST[1]>>shift) }else{ - CHECK_MV(P[4][0]>>shift, P[4][1]>>shift) - if(dmin<Z_THRESHOLD){ - *mx_ptr= P[4][0]>>shift; - *my_ptr= P[4][1]>>shift; -//printf("M\n"); + if(dmin<256 && ( P_LEFT[0] |P_LEFT[1] + |P_TOP[0] |P_TOP[1] + |P_TOPRIGHT[0]|P_TOPRIGHT[1])==0){ + *mx_ptr= 0; + *my_ptr= 0; + s->skip_me=1; return dmin; } - CHECK_MV(P[1][0]>>shift, P[1][1]>>shift) - CHECK_MV(P[2][0]>>shift, P[2][1]>>shift) - CHECK_MV(P[3][0]>>shift, P[3][1]>>shift) + CHECK_MV(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift) + if(dmin>256*2){ + CHECK_MV(P_LAST[0] >>shift, P_LAST[1] >>shift) + CHECK_MV(P_LEFT[0] >>shift, P_LEFT[1] >>shift) + CHECK_MV(P_TOP[0] >>shift, P_TOP[1] >>shift) + CHECK_MV(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift) + } } - CHECK_MV(P[0][0]>>shift, P[0][1]>>shift) - + if(dmin>256*4){ + CHECK_MV(P_LAST_RIGHT[0] >>shift, P_LAST_RIGHT[1] >>shift) + CHECK_MV(P_LAST_BOTTOM[0]>>shift, P_LAST_BOTTOM[1]>>shift) + } +#if 0 //doest only slow things down + if(dmin>512*3){ + int step; + dmin= score_map[0]; + best[0]= best[1]=0; + for(step=128; step>0; step>>=1){ + const int step2= step; + int y; + for(y=-step2+best[1]; y<=step2+best[1]; y+=step){ + int x; + if(y<ymin || y>ymax) continue; + + for(x=-step2+best[0]; x<=step2+best[0]; x+=step){ + if(x<xmin || x>xmax) continue; + if(x==best[0] && y==best[1]) continue; + CHECK_MV(x,y) + } + } + } + } +#endif //check(best[0],best[1],0, b0) - if(s->full_search==ME_EPZS) + if(s->me_method==ME_EPZS) dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, - pred_x, 
pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift); + pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, + shift, map, score_map, map_generation, pix_abs16x16); else - dmin= snake_search(s, best, dmin, new_pic, old_pic, pic_stride, - pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift); + dmin= cross_search(s, best, dmin, new_pic, old_pic, pic_stride, + pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, + shift, map, score_map, map_generation, pix_abs16x16); //check(best[0],best[1],0, b1) *mx_ptr= best[0]; *my_ptr= best[1]; @@ -612,8 +704,8 @@ static int epzs_motion_search(MpegEncContext * s, static int epzs_motion_search4(MpegEncContext * s, int block, int *mx_ptr, int *my_ptr, - int P[6][2], int pred_x, int pred_y, - int xmin, int ymin, int xmax, int ymax) + int P[10][2], int pred_x, int pred_y, + int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture) { int best[2]={0, 0}; int d, dmin; @@ -623,34 +715,47 @@ static int epzs_motion_search4(MpegEncContext * s, int block, UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame int quant= s->qscale; // qscale of the prev frame const int shift= 1+s->quarter_sample; + uint32_t *map= s->me_map; + uint16_t *score_map= s->me_score_map; + int map_generation; new_pic = s->new_picture[0] + pic_xy; - old_pic = s->last_picture[0] + pic_xy; - - dmin = pix_abs8x8(new_pic, old_pic, pic_stride); + old_pic = ref_picture + pic_xy; + map_generation= update_map_generation(s); + + dmin = 1000000; +//printf("%d %d %d %d //",xmin, ymin, xmax, ymax); /* first line */ - if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) { - CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift) + if ((s->mb_y == 0 || s->first_slice_line) && block<2) { + CHECK_MV4(P_LEFT[0]>>shift, P_LEFT[1]>>shift) + CHECK_MV4(P_LAST[0]>>shift, P_LAST[1]>>shift) + CHECK_MV4(P_MV1[0]>>shift, P_MV1[1]>>shift) }else{ - CHECK_MV4(P[4][0]>>shift, P[4][1]>>shift) - if(dmin<Z_THRESHOLD){ - *mx_ptr= 
P[4][0]>>shift; - *my_ptr= P[4][1]>>shift; -//printf("M\n"); - return dmin; + CHECK_MV4(P_MV1[0]>>shift, P_MV1[1]>>shift) + //FIXME try some early stop + if(dmin>64*2){ + CHECK_MV4(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift) + CHECK_MV4(P_LEFT[0]>>shift, P_LEFT[1]>>shift) + CHECK_MV4(P_TOP[0]>>shift, P_TOP[1]>>shift) + CHECK_MV4(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift) + CHECK_MV4(P_LAST[0]>>shift, P_LAST[1]>>shift) } - CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift) - CHECK_MV4(P[2][0]>>shift, P[2][1]>>shift) - CHECK_MV4(P[3][0]>>shift, P[3][1]>>shift) } - CHECK_MV4(P[0][0]>>shift, P[0][1]>>shift) - CHECK_MV4(P[5][0]>>shift, P[5][1]>>shift) + if(dmin>64*4){ + CHECK_MV4(P_LAST_RIGHT[0]>>shift, P_LAST_RIGHT[1]>>shift) + CHECK_MV4(P_LAST_BOTTOM[0]>>shift, P_LAST_BOTTOM[1]>>shift) + } + + if(s->me_method==ME_EPZS) + dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, + pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, + shift, map, score_map, map_generation, pix_abs8x8); + else + dmin= cross_search(s, best, dmin, new_pic, old_pic, pic_stride, + pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, + shift, map, score_map, map_generation, pix_abs8x8); -//check(best[0],best[1],0, b0) - dmin= small_diamond_search4MV(s, best, dmin, new_pic, old_pic, pic_stride, - pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift); -//check(best[0],best[1],0, b1) *mx_ptr= best[0]; *my_ptr= best[1]; @@ -659,56 +764,50 @@ static int epzs_motion_search4(MpegEncContext * s, int block, } #define CHECK_HALF_MV(suffix, x, y) \ - d= pix_abs16x16_ ## suffix(pix, ptr+((x)>>1), s->linesize);\ +{\ + d= pix_abs_ ## suffix(pix, ptr+((x)>>1), s->linesize);\ d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\ - if(d<dminh){\ - dminh= d;\ - mx= mx1 + x;\ - my= my1 + y;\ - } + COPY3_IF_LT(dminh, d, dx, x, dy, y)\ +} -#define CHECK_HALF_MV4(suffix, x, y) \ - d= pix_abs8x8_ ## suffix(pix, ptr+((x)>>1), s->linesize);\ - d += (mv_penalty[pen_x + x] + 
mv_penalty[pen_y + y])*quant;\ - if(d<dminh){\ - dminh= d;\ - mx= mx1 + x;\ - my= my1 + y;\ - } /* The idea would be to make half pel ME after Inter/Intra decision to save time. */ -static inline void halfpel_motion_search(MpegEncContext * s, +static inline int halfpel_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int dmin, int xmin, int ymin, int xmax, int ymax, - int pred_x, int pred_y) + int pred_x, int pred_y, uint8_t *ref_picture, + op_pixels_abs_func pix_abs_x2, + op_pixels_abs_func pix_abs_y2, op_pixels_abs_func pix_abs_xy2, int n) { UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame const int quant= s->qscale; - int pen_x, pen_y; - int mx, my, mx1, my1, d, xx, yy, dminh; + int mx, my, xx, yy, dminh; UINT8 *pix, *ptr; - mx = *mx_ptr; - my = *my_ptr; - ptr = s->last_picture[0] + (my * s->linesize) + mx; + if(s->skip_me){ + *mx_ptr = 0; + *my_ptr = 0; + return dmin; + }else - xx = 16 * s->mb_x; - yy = 16 * s->mb_y; + xx = 16 * s->mb_x + 8*(n&1); + yy = 16 * s->mb_y + 8*(n>>1); pix = s->new_picture[0] + (yy * s->linesize) + xx; + + mx = *mx_ptr; + my = *my_ptr; + ptr = ref_picture + ((yy + my) * s->linesize) + (xx + mx); dminh = dmin; if (mx > xmin && mx < xmax && my > ymin && my < ymax) { + int dx=0, dy=0; + int d, pen_x, pen_y; - mx= mx1= 2*(mx - xx); - my= my1= 2*(my - yy); - if(dmin < Z_THRESHOLD && mx==0 && my==0){ - *mx_ptr = 0; - *my_ptr = 0; - return; - } + mx<<=1; + my<<=1; pen_x= pred_x + mx; pen_y= pred_y + my; @@ -725,80 +824,135 @@ static inline void halfpel_motion_search(MpegEncContext * s, CHECK_HALF_MV(y2 , 0, +1) CHECK_HALF_MV(xy2, +1, +1) + mx+=dx; + my+=dy; }else{ - mx= 2*(mx - xx); - my= 2*(my - yy); + mx<<=1; + my<<=1; } *mx_ptr = mx; *my_ptr = my; + return dminh; } -static inline void halfpel_motion_search4(MpegEncContext * s, +static inline int fast_halfpel_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int dmin, int xmin, int ymin, int xmax, int ymax, - int pred_x, int pred_y, 
int block_x, int block_y) + int pred_x, int pred_y, uint8_t *ref_picture, + op_pixels_abs_func pix_abs_x2, + op_pixels_abs_func pix_abs_y2, op_pixels_abs_func pix_abs_xy2, int n) { UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame + uint16_t *score_map= s->me_score_map; const int quant= s->qscale; - int pen_x, pen_y; - int mx, my, mx1, my1, d, xx, yy, dminh; + int mx, my, xx, yy, dminh; UINT8 *pix, *ptr; - xx = 8 * block_x; - yy = 8 * block_y; + if(s->skip_me){ +// printf("S"); + *mx_ptr = 0; + *my_ptr = 0; + return dmin; + } +// printf("N"); + + xx = 16 * s->mb_x + 8*(n&1); + yy = 16 * s->mb_y + 8*(n>>1); pix = s->new_picture[0] + (yy * s->linesize) + xx; - + mx = *mx_ptr; my = *my_ptr; - ptr = s->last_picture[0] + ((yy+my) * s->linesize) + xx + mx; - + ptr = ref_picture + ((yy + my) * s->linesize) + (xx + mx); + dminh = dmin; if (mx > xmin && mx < xmax && my > ymin && my < ymax) { + int dx=0, dy=0; + int d, pen_x, pen_y; + const int index= (my<<ME_MAP_SHIFT) + mx; + const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)]; + const int l= score_map[(index- 1 )&(ME_MAP_SIZE-1)]; + const int r= score_map[(index+ 1 )&(ME_MAP_SIZE-1)]; + const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)]; + mx<<=1; + my<<=1; - mx= mx1= 2*mx; - my= my1= 2*my; - if(dmin < Z_THRESHOLD && mx==0 && my==0){ - *mx_ptr = 0; - *my_ptr = 0; - return; - } pen_x= pred_x + mx; pen_y= pred_y + my; ptr-= s->linesize; - CHECK_HALF_MV4(xy2, -1, -1) - CHECK_HALF_MV4(y2 , 0, -1) - CHECK_HALF_MV4(xy2, +1, -1) - - ptr+= s->linesize; - CHECK_HALF_MV4(x2 , -1, 0) - CHECK_HALF_MV4(x2 , +1, 0) - CHECK_HALF_MV4(xy2, -1, +1) - CHECK_HALF_MV4(y2 , 0, +1) - CHECK_HALF_MV4(xy2, +1, +1) + if(t<=b){ + CHECK_HALF_MV(y2 , 0, -1) + if(l<=r){ + CHECK_HALF_MV(xy2, -1, -1) + if(t+r<=b+l){ + CHECK_HALF_MV(xy2, +1, -1) + ptr+= s->linesize; + }else{ + ptr+= s->linesize; + CHECK_HALF_MV(xy2, -1, +1) + } + CHECK_HALF_MV(x2 , -1, 0) + }else{ + CHECK_HALF_MV(xy2, +1, -1) + 
if(t+l<=b+r){ + CHECK_HALF_MV(xy2, -1, -1) + ptr+= s->linesize; + }else{ + ptr+= s->linesize; + CHECK_HALF_MV(xy2, +1, +1) + } + CHECK_HALF_MV(x2 , +1, 0) + } + }else{ + if(l<=r){ + if(t+l<=b+r){ + CHECK_HALF_MV(xy2, -1, -1) + ptr+= s->linesize; + }else{ + ptr+= s->linesize; + CHECK_HALF_MV(xy2, +1, +1) + } + CHECK_HALF_MV(x2 , -1, 0) + CHECK_HALF_MV(xy2, -1, +1) + }else{ + if(t+r<=b+l){ + CHECK_HALF_MV(xy2, +1, -1) + ptr+= s->linesize; + }else{ + ptr+= s->linesize; + CHECK_HALF_MV(xy2, -1, +1) + } + CHECK_HALF_MV(x2 , +1, 0) + CHECK_HALF_MV(xy2, +1, +1) + } + CHECK_HALF_MV(y2 , 0, +1) + } + mx+=dx; + my+=dy; }else{ - mx*=2; - my*=2; + mx<<=1; + my<<=1; } *mx_ptr = mx; *my_ptr = my; + return dminh; } -static inline void set_mv_tables(MpegEncContext * s, int mx, int my) +static inline void set_p_mv_tables(MpegEncContext * s, int mx, int my, int mv4) { - const int xy= s->mb_x + s->mb_y*s->mb_width; + const int xy= s->mb_x + 1 + (s->mb_y + 1)*(s->mb_width + 2); - s->mv_table[0][xy] = mx; - s->mv_table[1][xy] = my; + s->p_mv_table[xy][0] = mx; + s->p_mv_table[xy][1] = my; /* has allready been set to the 4 MV if 4MV is done */ - if(!(s->flags&CODEC_FLAG_4MV)){ + if(mv4){ int mot_xy= s->block_index[0]; s->motion_val[mot_xy ][0]= mx; @@ -814,59 +968,158 @@ static inline void set_mv_tables(MpegEncContext * s, int mx, int my) } } -#ifndef CONFIG_TEST_MV_ENCODE - -void estimate_motion(MpegEncContext * s, - int mb_x, int mb_y) +static inline void get_limits(MpegEncContext *s, int *range, int *xmin, int *ymin, int *xmax, int *ymax, int f_code) { - UINT8 *pix, *ppix; - int sum, varc, vard, mx, my, range, dmin, xx, yy; - int xmin, ymin, xmax, ymax; - int rel_xmin, rel_ymin, rel_xmax, rel_ymax; - int pred_x=0, pred_y=0; - int P[6][2]; - const int shift= 1+s->quarter_sample; - int mb_type=0; - - range = 8 * (1 << (s->f_code - 1)); + *range = 8 * (1 << (f_code - 1)); /* XXX: temporary kludge to avoid overflow for msmpeg4 */ if (s->out_format == FMT_H263 && !s->h263_msmpeg4) - range 
= range * 2; + *range *= 2; if (s->unrestricted_mv) { - xmin = -16; - ymin = -16; + *xmin = -16; + *ymin = -16; if (s->h263_plus) - range *= 2; + *range *= 2; if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4){ - xmax = s->mb_width*16; - ymax = s->mb_height*16; + *xmax = s->mb_width*16; + *ymax = s->mb_height*16; }else { /* XXX: dunno if this is correct but ffmpeg4 decoder wont like it otherwise (cuz the drawn edge isnt large enough))*/ - xmax = s->width; - ymax = s->height; + *xmax = s->width; + *ymax = s->height; } } else { - xmin = 0; - ymin = 0; - xmax = s->mb_width*16 - 16; - ymax = s->mb_height*16 - 16; + *xmin = 0; + *ymin = 0; + *xmax = s->mb_width*16 - 16; + *ymax = s->mb_height*16 - 16; + } +} + +static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, int ymax, int mx, int my, int shift) +{ + int block; + int P[10][2]; + uint8_t *ref_picture= s->last_picture[0]; + int dmin_sum=0; + + for(block=0; block<4; block++){ + int mx4, my4; + int pred_x4, pred_y4; + int dmin4; + static const int off[4]= {2, 1, 1, -1}; + const int mot_stride = s->block_wrap[0]; + const int mot_xy = s->block_index[block]; +// const int block_x= (block&1); +// const int block_y= (block>>1); +#if 1 // this saves us a bit of cliping work and shouldnt affect compression in a negative way + const int rel_xmin4= xmin; + const int rel_xmax4= xmax; + const int rel_ymin4= ymin; + const int rel_ymax4= ymax; +#else + const int rel_xmin4= xmin - block_x*8; + const int rel_xmax4= xmax - block_x*8 + 8; + const int rel_ymin4= ymin - block_y*8; + const int rel_ymax4= ymax - block_y*8 + 8; +#endif + P_LAST[0] = s->motion_val[mot_xy ][0]; + P_LAST[1] = s->motion_val[mot_xy ][1]; + P_LEFT[0] = s->motion_val[mot_xy - 1][0]; + P_LEFT[1] = s->motion_val[mot_xy - 1][1]; + P_LAST_RIGHT[0] = s->motion_val[mot_xy + 1][0]; + P_LAST_RIGHT[1] = s->motion_val[mot_xy + 1][1]; + P_LAST_BOTTOM[0]= s->motion_val[mot_xy + 1*mot_stride][0]; + P_LAST_BOTTOM[1]= s->motion_val[mot_xy + 
1*mot_stride][1]; + + if(P_LEFT[0] > (rel_xmax4<<shift)) P_LEFT[0] = (rel_xmax4<<shift); + if(P_LAST_RIGHT[0] < (rel_xmin4<<shift)) P_LAST_RIGHT[0] = (rel_xmin4<<shift); + if(P_LAST_BOTTOM[1]< (rel_ymin4<<shift)) P_LAST_BOTTOM[1]= (rel_ymin4<<shift); + + /* special case for first line */ + if ((s->mb_y == 0 || s->first_slice_line) && block<2) { + pred_x4= P_LEFT[0]; + pred_y4= P_LEFT[1]; + } else { + P_TOP[0] = s->motion_val[mot_xy - mot_stride ][0]; + P_TOP[1] = s->motion_val[mot_xy - mot_stride ][1]; + P_TOPRIGHT[0] = s->motion_val[mot_xy - mot_stride + off[block]][0]; + P_TOPRIGHT[1] = s->motion_val[mot_xy - mot_stride + off[block]][1]; + if(P_TOP[1] > (rel_ymax4<<shift)) P_TOP[1] = (rel_ymax4<<shift); + if(P_TOPRIGHT[0] < (rel_xmin4<<shift)) P_TOPRIGHT[0]= (rel_xmin4<<shift); + if(P_TOPRIGHT[0] > (rel_xmax4<<shift)) P_TOPRIGHT[0]= (rel_xmax4<<shift); + if(P_TOPRIGHT[1] > (rel_ymax4<<shift)) P_TOPRIGHT[1]= (rel_ymax4<<shift); + + P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); + P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); + + if(s->out_format == FMT_H263){ + pred_x4 = P_MEDIAN[0]; + pred_y4 = P_MEDIAN[1]; + }else { /* mpeg1 at least */ + pred_x4= P_LEFT[0]; + pred_y4= P_LEFT[1]; + } + } + P_MV1[0]= mx; + P_MV1[1]= my; + + dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, ref_picture); + + dmin4= fast_halfpel_motion_search(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, + pred_x4, pred_y4, ref_picture, pix_abs8x8_x2, + pix_abs8x8_y2, pix_abs8x8_xy2, block); + + s->motion_val[ s->block_index[block] ][0]= mx4; + s->motion_val[ s->block_index[block] ][1]= my4; + dmin_sum+= dmin4; } - switch(s->full_search) { + return dmin_sum; +} + +void ff_estimate_p_frame_motion(MpegEncContext * s, + int mb_x, int mb_y) +{ + UINT8 *pix, *ppix; + int sum, varc, vard, mx, my, range, dmin, xx, yy; + int xmin, ymin, xmax, ymax; + int rel_xmin, rel_ymin, rel_xmax, rel_ymax; + 
int pred_x=0, pred_y=0; + int P[10][2]; + const int shift= 1+s->quarter_sample; + int mb_type=0; + uint8_t *ref_picture= s->last_picture[0]; + + get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code); + rel_xmin= xmin - mb_x*16; + rel_xmax= xmax - mb_x*16; + rel_ymin= ymin - mb_y*16; + rel_ymax= ymax - mb_y*16; + s->skip_me=0; + + switch(s->me_method) { case ME_ZERO: default: no_motion_search(s, &mx, &my); + mx-= mb_x*16; + my-= mb_y*16; dmin = 0; break; case ME_FULL: - dmin = full_motion_search(s, &mx, &my, range, xmin, ymin, xmax, ymax); + dmin = full_motion_search(s, &mx, &my, range, xmin, ymin, xmax, ymax, ref_picture); + mx-= mb_x*16; + my-= mb_y*16; break; case ME_LOG: - dmin = log_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax); + dmin = log_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture); + mx-= mb_x*16; + my-= mb_y*16; break; case ME_PHODS: - dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax); + dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture); + mx-= mb_x*16; + my-= mb_y*16; break; case ME_X1: case ME_EPZS: @@ -874,133 +1127,69 @@ void estimate_motion(MpegEncContext * s, const int mot_stride = s->block_wrap[0]; const int mot_xy = s->block_index[0]; - rel_xmin= xmin - mb_x*16; - rel_xmax= xmax - mb_x*16; - rel_ymin= ymin - mb_y*16; - rel_ymax= ymax - mb_y*16; + P_LAST[0] = s->motion_val[mot_xy ][0]; + P_LAST[1] = s->motion_val[mot_xy ][1]; + P_LEFT[0] = s->motion_val[mot_xy - 1][0]; + P_LEFT[1] = s->motion_val[mot_xy - 1][1]; + P_LAST_RIGHT[0] = s->motion_val[mot_xy + 2][0]; + P_LAST_RIGHT[1] = s->motion_val[mot_xy + 2][1]; + P_LAST_BOTTOM[0]= s->motion_val[mot_xy + 2*mot_stride][0]; + P_LAST_BOTTOM[1]= s->motion_val[mot_xy + 2*mot_stride][1]; - P[0][0] = s->motion_val[mot_xy ][0]; - P[0][1] = s->motion_val[mot_xy ][1]; - P[1][0] = s->motion_val[mot_xy - 1][0]; - P[1][1] = s->motion_val[mot_xy - 1][1]; - if(P[1][0] > (rel_xmax<<shift)) P[1][0]= 
(rel_xmax<<shift); + if(P_LEFT[0] > (rel_xmax<<shift)) P_LEFT[0] = (rel_xmax<<shift); + if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift); + if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift); /* special case for first line */ - if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) { - P[4][0] = P[1][0]; - P[4][1] = P[1][1]; + if ((mb_y == 0 || s->first_slice_line)) { + pred_x= P_LEFT[0]; + pred_y= P_LEFT[1]; } else { - P[2][0] = s->motion_val[mot_xy - mot_stride ][0]; - P[2][1] = s->motion_val[mot_xy - mot_stride ][1]; - P[3][0] = s->motion_val[mot_xy - mot_stride + 2 ][0]; - P[3][1] = s->motion_val[mot_xy - mot_stride + 2 ][1]; - if(P[2][1] > (rel_ymax<<shift)) P[2][1]= (rel_ymax<<shift); - if(P[3][0] < (rel_xmin<<shift)) P[3][0]= (rel_xmin<<shift); - if(P[3][1] > (rel_ymax<<shift)) P[3][1]= (rel_ymax<<shift); + P_TOP[0] = s->motion_val[mot_xy - mot_stride ][0]; + P_TOP[1] = s->motion_val[mot_xy - mot_stride ][1]; + P_TOPRIGHT[0] = s->motion_val[mot_xy - mot_stride + 2][0]; + P_TOPRIGHT[1] = s->motion_val[mot_xy - mot_stride + 2][1]; + if(P_TOP[1] > (rel_ymax<<shift)) P_TOP[1] = (rel_ymax<<shift); + if(P_TOPRIGHT[0] < (rel_xmin<<shift)) P_TOPRIGHT[0]= (rel_xmin<<shift); + if(P_TOPRIGHT[1] > (rel_ymax<<shift)) P_TOPRIGHT[1]= (rel_ymax<<shift); - P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]); - P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]); - } - if(s->out_format == FMT_H263){ - pred_x = P[4][0]; - pred_y = P[4][1]; - }else { /* mpeg1 at least */ - pred_x= P[1][0]; - pred_y= P[1][1]; + P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); + P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); + + if(s->out_format == FMT_H263){ + pred_x = P_MEDIAN[0]; + pred_y = P_MEDIAN[1]; + }else { /* mpeg1 at least */ + pred_x= P_LEFT[0]; + pred_y= P_LEFT[1]; + } } } - dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax); + dmin = epzs_motion_search(s, &mx, &my, P, 
pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, ref_picture); - mx+= mb_x*16; - my+= mb_y*16; break; } - - if(s->flags&CODEC_FLAG_4MV){ - int block; - - mb_type|= MB_TYPE_INTER4V; - - for(block=0; block<4; block++){ - int mx4, my4; - int pred_x4, pred_y4; - int dmin4; - static const int off[4]= {2, 1, 1, -1}; - const int mot_stride = s->block_wrap[0]; - const int mot_xy = s->block_index[block]; - const int block_x= mb_x*2 + (block&1); - const int block_y= mb_y*2 + (block>>1); - - const int rel_xmin4= xmin - block_x*8; - const int rel_xmax4= xmax - block_x*8 + 8; - const int rel_ymin4= ymin - block_y*8; - const int rel_ymax4= ymax - block_y*8 + 8; - - P[0][0] = s->motion_val[mot_xy ][0]; - P[0][1] = s->motion_val[mot_xy ][1]; - P[1][0] = s->motion_val[mot_xy - 1][0]; - P[1][1] = s->motion_val[mot_xy - 1][1]; - if(P[1][0] > (rel_xmax4<<shift)) P[1][0]= (rel_xmax4<<shift); - - /* special case for first line */ - if ((mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) { - P[4][0] = P[1][0]; - P[4][1] = P[1][1]; - } else { - P[2][0] = s->motion_val[mot_xy - mot_stride ][0]; - P[2][1] = s->motion_val[mot_xy - mot_stride ][1]; - P[3][0] = s->motion_val[mot_xy - mot_stride + off[block]][0]; - P[3][1] = s->motion_val[mot_xy - mot_stride + off[block]][1]; - if(P[2][1] > (rel_ymax4<<shift)) P[2][1]= (rel_ymax4<<shift); - if(P[3][0] < (rel_xmin4<<shift)) P[3][0]= (rel_xmin4<<shift); - if(P[3][0] > (rel_xmax4<<shift)) P[3][0]= (rel_xmax4<<shift); - if(P[3][1] > (rel_ymax4<<shift)) P[3][1]= (rel_ymax4<<shift); - - P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]); - P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]); - } - if(s->out_format == FMT_H263){ - pred_x4 = P[4][0]; - pred_y4 = P[4][1]; - }else { /* mpeg1 at least */ - pred_x4= P[1][0]; - pred_y4= P[1][1]; - } - P[5][0]= mx - mb_x*16; - P[5][1]= my - mb_y*16; - - dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4); - - halfpel_motion_search4(s, 
&mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, - pred_x4, pred_y4, block_x, block_y); - - s->motion_val[ s->block_index[block] ][0]= mx4; - s->motion_val[ s->block_index[block] ][1]= my4; - } - } /* intra / predictive decision */ xx = mb_x * 16; yy = mb_y * 16; pix = s->new_picture[0] + (yy * s->linesize) + xx; - /* At this point (mx,my) are full-pell and the absolute displacement */ - ppix = s->last_picture[0] + (my * s->linesize) + mx; + /* At this point (mx,my) are full-pell and the relative displacement */ + ppix = ref_picture + ((yy+my) * s->linesize) + (xx+mx); sum = pix_sum(pix, s->linesize); -#if 0 - varc = pix_dev(pix, s->linesize, (sum+128)>>8) + INTER_BIAS; - vard = pix_abs16x16(pix, ppix, s->linesize); -#else + sum= (sum+8)>>4; - varc = ((pix_norm1(pix, s->linesize) - sum*sum + 128 + 500)>>8); + varc = (pix_norm1(pix, s->linesize) - sum*sum + 500 + 128)>>8; vard = (pix_norm(pix, ppix, s->linesize)+128)>>8; -#endif - - s->mb_var[s->mb_width * mb_y + mb_x] = varc; - s->avg_mb_var+= varc; - s->mc_mb_var += vard; - +//printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout); + s->mb_var [s->mb_width * mb_y + mb_x] = varc; + s->mc_mb_var[s->mb_width * mb_y + mb_x] = vard; + s->mb_var_sum += varc; + s->mc_mb_var_sum += vard; +//printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout); + #if 0 printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n", varc, s->avg_mb_var, sum, vard, mx - xx, my - yy); @@ -1010,68 +1199,556 @@ void estimate_motion(MpegEncContext * s, mb_type|= MB_TYPE_INTRA; if (varc*2 + 200 > vard){ mb_type|= MB_TYPE_INTER; - halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y); + if(s->me_method >= ME_EPZS) + fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, + pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, + pix_abs16x16_xy2, 0); 
+ else + halfpel_motion_search( s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, + pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, + pix_abs16x16_xy2, 0); }else{ - mx = mx*2 - mb_x*32; - my = my*2 - mb_y*32; + mx <<=1; + my <<=1; } + if((s->flags&CODEC_FLAG_4MV) + && !s->skip_me && varc>50 && vard>10){ + mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift); + mb_type|=MB_TYPE_INTER4V; + + set_p_mv_tables(s, mx, my, 0); + }else + set_p_mv_tables(s, mx, my, 1); }else{ if (vard <= 64 || vard < varc) { mb_type|= MB_TYPE_INTER; - if (s->full_search != ME_ZERO) { - halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y); + if (s->me_method != ME_ZERO) { + if(s->me_method >= ME_EPZS) + dmin= fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, + pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, + pix_abs16x16_xy2, 0); + else + dmin= halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, + pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, + pix_abs16x16_xy2, 0); + if((s->flags&CODEC_FLAG_4MV) + && !s->skip_me && varc>50 && vard>10){ + int dmin4= mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift); + if(dmin4 + 128 <dmin) + mb_type= MB_TYPE_INTER4V; + } + set_p_mv_tables(s, mx, my, mb_type!=MB_TYPE_INTER4V); + } else { - mx -= 16 * mb_x; - my -= 16 * mb_y; + mx <<=1; + my <<=1; + } +#if 0 + if (vard < 10) { + skip++; + fprintf(stderr,"\nEarly skip: %d vard: %2d varc: %5d dmin: %d", + skip, vard, varc, dmin); } +#endif }else{ mb_type|= MB_TYPE_INTRA; - mx = 0;//mx*2 - 32 * mb_x; - my = 0;//my*2 - 32 * mb_y; + mx = 0; + my = 0; } } s->mb_type[mb_y*s->mb_width + mb_x]= mb_type; - set_mv_tables(s, mx, my); } -#else +int ff_estimate_motion_b(MpegEncContext * s, + int mb_x, int mb_y, int16_t (*mv_table)[2], uint8_t *ref_picture, int f_code) +{ + int mx, my, range, dmin; + int xmin, ymin, xmax, ymax; + int rel_xmin, 
rel_ymin, rel_xmax, rel_ymax; + int pred_x=0, pred_y=0; + int P[10][2]; + const int shift= 1+s->quarter_sample; + const int mot_stride = s->mb_width + 2; + const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1; + + get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, f_code); + rel_xmin= xmin - mb_x*16; + rel_xmax= xmax - mb_x*16; + rel_ymin= ymin - mb_y*16; + rel_ymax= ymax - mb_y*16; + + switch(s->me_method) { + case ME_ZERO: + default: + no_motion_search(s, &mx, &my); + dmin = 0; + mx-= mb_x*16; + my-= mb_y*16; + break; + case ME_FULL: + dmin = full_motion_search(s, &mx, &my, range, xmin, ymin, xmax, ymax, ref_picture); + mx-= mb_x*16; + my-= mb_y*16; + break; + case ME_LOG: + dmin = log_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture); + mx-= mb_x*16; + my-= mb_y*16; + break; + case ME_PHODS: + dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture); + mx-= mb_x*16; + my-= mb_y*16; + break; + case ME_X1: + case ME_EPZS: + { + + P_LAST[0] = mv_table[mot_xy ][0]; + P_LAST[1] = mv_table[mot_xy ][1]; + P_LEFT[0] = mv_table[mot_xy - 1][0]; + P_LEFT[1] = mv_table[mot_xy - 1][1]; + P_LAST_RIGHT[0] = mv_table[mot_xy + 1][0]; + P_LAST_RIGHT[1] = mv_table[mot_xy + 1][1]; + P_LAST_BOTTOM[0] = mv_table[mot_xy + mot_stride][0]; + P_LAST_BOTTOM[1] = mv_table[mot_xy + mot_stride][1]; + + if(P_LEFT[0] > (rel_xmax<<shift)) P_LEFT[0] = (rel_xmax<<shift); + if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift); + if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift); + + /* special case for first line */ + if ((mb_y == 0 || s->first_slice_line)) { + } else { + P_TOP[0] = mv_table[mot_xy - mot_stride ][0]; + P_TOP[1] = mv_table[mot_xy - mot_stride ][1]; + P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1 ][0]; + P_TOPRIGHT[1] = mv_table[mot_xy - mot_stride + 1 ][1]; + if(P_TOP[1] > (rel_ymax<<shift)) P_TOP[1]= (rel_ymax<<shift); + if(P_TOPRIGHT[0] < (rel_xmin<<shift)) P_TOPRIGHT[0]= 
(rel_xmin<<shift); + if(P_TOPRIGHT[1] > (rel_ymax<<shift)) P_TOPRIGHT[1]= (rel_ymax<<shift); + + P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); + P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); + } + pred_x= P_LEFT[0]; + pred_y= P_LEFT[1]; + } + dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, ref_picture); + + break; + } + + dmin= fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, + pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, + pix_abs16x16_xy2, 0); +//printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my); +// s->mb_type[mb_y*s->mb_width + mb_x]= mb_type; + mv_table[mot_xy][0]= mx; + mv_table[mot_xy][1]= my; + return dmin; +} + + +static inline int check_bidir_mv(MpegEncContext * s, + int mb_x, int mb_y, + int motion_fx, int motion_fy, + int motion_bx, int motion_by, + int pred_fx, int pred_fy, + int pred_bx, int pred_by) +{ + //FIXME optimize? + //FIXME direct mode penalty + UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame + uint8_t *dest_y = s->me_scratchpad; + uint8_t *ptr; + int dxy; + int src_x, src_y; + int fbmin; + + fbmin = (mv_penalty[motion_fx-pred_fx] + mv_penalty[motion_fy-pred_fy])*s->qscale; + + dxy = ((motion_fy & 1) << 1) | (motion_fx & 1); + src_x = mb_x * 16 + (motion_fx >> 1); + src_y = mb_y * 16 + (motion_fy >> 1); + + ptr = s->last_picture[0] + (src_y * s->linesize) + src_x; + put_pixels_tab[dxy](dest_y , ptr , s->linesize, 16); + put_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16); + + fbmin += (mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->qscale; + + dxy = ((motion_by & 1) << 1) | (motion_bx & 1); + src_x = mb_x * 16 + (motion_bx >> 1); + src_y = mb_y * 16 + (motion_by >> 1); + + ptr = s->next_picture[0] + (src_y * s->linesize) + src_x; + avg_pixels_tab[dxy](dest_y , ptr , s->linesize, 16); + avg_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16); + + fbmin 
+= pix_abs16x16(s->new_picture[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize); + return fbmin; +} -/* test version which generates valid random vectors */ -int estimate_motion(MpegEncContext * s, - int mb_x, int mb_y, - int *mx_ptr, int *my_ptr) +/* refine the bidir vectors in hq mode and return the score in both lq & hq mode*/ +static inline int bidir_refine(MpegEncContext * s, + int mb_x, int mb_y) { - int xx, yy, x1, y1, x2, y2, range; - - if ((random() % 10) >= 5) { - range = 8 * (1 << (s->f_code - 1)); - if (s->out_format == FMT_H263 && !s->h263_msmpeg4) - range = range * 2; - - xx = 16 * s->mb_x; - yy = 16 * s->mb_y; - x1 = xx - range; - if (x1 < 0) - x1 = 0; - x2 = xx + range - 1; - if (x2 > (s->width - 16)) - x2 = s->width - 16; - y1 = yy - range; - if (y1 < 0) - y1 = 0; - y2 = yy + range - 1; - if (y2 > (s->height - 16)) - y2 = s->height - 16; - - *mx_ptr = (random() % (2 * (x2 - x1 + 1))) + 2 * (x1 - xx); - *my_ptr = (random() % (2 * (y2 - y1 + 1))) + 2 * (y1 - yy); - return 0; + const int mot_stride = s->mb_width + 2; + const int xy = (mb_y + 1)*mot_stride + mb_x + 1; + int fbmin; + int pred_fx= s->b_bidir_forw_mv_table[xy-1][0]; + int pred_fy= s->b_bidir_forw_mv_table[xy-1][1]; + int pred_bx= s->b_bidir_back_mv_table[xy-1][0]; + int pred_by= s->b_bidir_back_mv_table[xy-1][1]; + int motion_fx= s->b_bidir_forw_mv_table[xy][0]= s->b_forw_mv_table[xy][0]; + int motion_fy= s->b_bidir_forw_mv_table[xy][1]= s->b_forw_mv_table[xy][1]; + int motion_bx= s->b_bidir_back_mv_table[xy][0]= s->b_back_mv_table[xy][0]; + int motion_by= s->b_bidir_back_mv_table[xy][1]= s->b_back_mv_table[xy][1]; + + //FIXME do refinement and add flag + + fbmin= check_bidir_mv(s, mb_x, mb_y, + motion_fx, motion_fy, + motion_bx, motion_by, + pred_fx, pred_fy, + pred_bx, pred_by); + + return fbmin; +} + +static inline int direct_search(MpegEncContext * s, + int mb_x, int mb_y) +{ + int P[10][2]; + const int mot_stride = s->mb_width + 2; + const int mot_xy = (mb_y + 1)*mot_stride + 
mb_x + 1; + int dmin, dmin2; + int motion_fx, motion_fy, motion_bx, motion_by, motion_bx0, motion_by0; + int motion_dx, motion_dy; + const int motion_px= s->p_mv_table[mot_xy][0]; + const int motion_py= s->p_mv_table[mot_xy][1]; + const int time_pp= s->pp_time; + const int time_bp= s->bp_time; + const int time_pb= time_pp - time_bp; + int bx, by; + int mx, my, mx2, my2; + uint8_t *ref_picture= s->me_scratchpad - (mb_x + 1 + (mb_y + 1)*s->linesize)*16; + int16_t (*mv_table)[2]= s->b_direct_mv_table; + uint16_t *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame + + /* thanks to iso-mpeg the rounding is different for the zero vector, so we need to handle that ... */ + motion_fx= (motion_px*time_pb)/time_pp; + motion_fy= (motion_py*time_pb)/time_pp; + motion_bx0= (-motion_px*time_bp)/time_pp; + motion_by0= (-motion_py*time_bp)/time_pp; + motion_dx= motion_dy=0; + dmin2= check_bidir_mv(s, mb_x, mb_y, + motion_fx, motion_fy, + motion_bx0, motion_by0, + motion_fx, motion_fy, + motion_bx0, motion_by0) - s->qscale; + + motion_bx= motion_fx - motion_px; + motion_by= motion_fy - motion_py; + for(by=-1; by<2; by++){ + for(bx=-1; bx<2; bx++){ + uint8_t *dest_y = s->me_scratchpad + (by+1)*s->linesize*16 + (bx+1)*16; + uint8_t *ptr; + int dxy; + int src_x, src_y; + const int width= s->width; + const int height= s->height; + + dxy = ((motion_fy & 1) << 1) | (motion_fx & 1); + src_x = (mb_x + bx) * 16 + (motion_fx >> 1); + src_y = (mb_y + by) * 16 + (motion_fy >> 1); + src_x = clip(src_x, -16, width); + if (src_x == width) dxy &= ~1; + src_y = clip(src_y, -16, height); + if (src_y == height) dxy &= ~2; + + ptr = s->last_picture[0] + (src_y * s->linesize) + src_x; + put_pixels_tab[dxy](dest_y , ptr , s->linesize, 16); + put_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16); + + dxy = ((motion_by & 1) << 1) | (motion_bx & 1); + src_x = (mb_x + bx) * 16 + (motion_bx >> 1); + src_y = (mb_y + by) * 16 + (motion_by >> 1); + src_x = clip(src_x, -16, width); + 
if (src_x == width) dxy &= ~1; + src_y = clip(src_y, -16, height); + if (src_y == height) dxy &= ~2; + + avg_pixels_tab[dxy](dest_y , ptr , s->linesize, 16); + avg_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16); + } + } + + P_LAST[0] = mv_table[mot_xy ][0]; + P_LAST[1] = mv_table[mot_xy ][1]; + P_LEFT[0] = mv_table[mot_xy - 1][0]; + P_LEFT[1] = mv_table[mot_xy - 1][1]; + P_LAST_RIGHT[0] = mv_table[mot_xy + 1][0]; + P_LAST_RIGHT[1] = mv_table[mot_xy + 1][1]; + P_LAST_BOTTOM[0] = mv_table[mot_xy + mot_stride][0]; + P_LAST_BOTTOM[1] = mv_table[mot_xy + mot_stride][1]; +/* + if(P_LEFT[0] > (rel_xmax<<shift)) P_LEFT[0] = (rel_xmax<<shift); + if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift); + if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift); +*/ + /* special case for first line */ + if ((mb_y == 0 || s->first_slice_line)) { } else { - *mx_ptr = 0; - *my_ptr = 0; - return 1; + P_TOP[0] = mv_table[mot_xy - mot_stride ][0]; + P_TOP[1] = mv_table[mot_xy - mot_stride ][1]; + P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1 ][0]; + P_TOPRIGHT[1] = mv_table[mot_xy - mot_stride + 1 ][1]; + + P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); + P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); } + dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, -16, -16, 15, 15, ref_picture); + if(mx==0 && my==0) dmin=99999999; // not representable, due to rounding stuff + if(dmin2<dmin){ + dmin= dmin2; + mx=0; + my=0; + } +#if 1 + mx2= mx= mx*2; + my2= my= my*2; + for(by=-1; by<2; by++){ + if(my2+by < -32) continue; + for(bx=-1; bx<2; bx++){ + if(bx==0 && by==0) continue; + if(mx2+bx < -32) continue; + dmin2= check_bidir_mv(s, mb_x, mb_y, + mx2+bx+motion_fx, my2+by+motion_fy, + mx2+bx+motion_bx, my2+by+motion_by, + mx2+bx+motion_fx, my2+by+motion_fy, + motion_bx, motion_by) - s->qscale; + + if(dmin2<dmin){ + dmin=dmin2; + mx= mx2 + bx; + my= my2 + by; + } + } + } +#else + mx*=2; my*=2; +#endif + if(mx==0 && my==0){ + 
motion_bx= motion_bx0; + motion_by= motion_by0; + } + + s->b_direct_mv_table[mot_xy][0]= mx; + s->b_direct_mv_table[mot_xy][1]= my; + s->b_direct_forw_mv_table[mot_xy][0]= motion_fx + mx; + s->b_direct_forw_mv_table[mot_xy][1]= motion_fy + my; + s->b_direct_back_mv_table[mot_xy][0]= motion_bx + mx; + s->b_direct_back_mv_table[mot_xy][1]= motion_by + my; + return dmin; } -#endif +void ff_estimate_b_frame_motion(MpegEncContext * s, + int mb_x, int mb_y) +{ + const int quant= s->qscale; + int fmin, bmin, dmin, fbmin; + int type=0; + + dmin= direct_search(s, mb_x, mb_y); + + fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, s->last_picture[0], s->f_code); + bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, s->next_picture[0], s->b_code) - quant; +//printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]); + + fbmin= bidir_refine(s, mb_x, mb_y); + + if(s->flags&CODEC_FLAG_HQ){ + type= MB_TYPE_FORWARD | MB_TYPE_BACKWARD | MB_TYPE_BIDIR | MB_TYPE_DIRECT; + }else{ + int score= dmin; + type=MB_TYPE_DIRECT; + + if(fmin<score){ + score=fmin; + type= MB_TYPE_FORWARD; + } + if(bmin<score){ + score=bmin; + type= MB_TYPE_BACKWARD; + } + if(fbmin<score){ + score=fbmin; + type= MB_TYPE_BIDIR; + } + s->mc_mb_var_sum += score; + s->mc_mb_var[mb_y*s->mb_width + mb_x] = score; + } +/* +{ +static int count=0; +static int sum=0; +if(type==MB_TYPE_DIRECT){ + int diff= ABS(s->b_forw_mv_table) +} +}*/ + + s->mb_type[mb_y*s->mb_width + mb_x]= type; +/* if(mb_y==0 && mb_x==0) printf("\n"); + if(mb_x==0) printf("\n"); + printf("%d", av_log2(type)); +*/ +} + +/* find best f_code for ME which do unlimited searches */ +int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type) +{ + if(s->me_method>=ME_EPZS){ + int score[8]; + int i, y; + UINT8 * fcode_tab= s->fcode_tab; + int best_fcode=-1; + int best_score=-10000000; + + for(i=0; i<8; i++) score[i]= s->mb_num*(8-i); //FIXME *2 and all other too so its the same but nicer + + for(y=0; 
y<s->mb_height; y++){ + int x; + int xy= (y+1)* (s->mb_width+2) + 1; + i= y*s->mb_width; + for(x=0; x<s->mb_width; x++){ + if(s->mb_type[i] & type){ + int fcode= MAX(fcode_tab[mv_table[xy][0] + MAX_MV], + fcode_tab[mv_table[xy][1] + MAX_MV]); + int j; + + for(j=0; j<fcode && j<8; j++){ + if(s->pict_type==B_TYPE || s->mc_mb_var[i] < s->mb_var[i]) + score[j]-= 170; + } + } + i++; + xy++; + } + } + + for(i=1; i<8; i++){ + if(score[i] > best_score){ + best_score= score[i]; + best_fcode= i; + } +// printf("%d %d\n", i, score[i]); + } + +// printf("fcode: %d type: %d\n", i, s->pict_type); + return best_fcode; +/* for(i=0; i<=MAX_FCODE; i++){ + printf("%d ", mv_num[i]); + } + printf("\n");*/ + }else{ + return 1; + } +} + +void ff_fix_long_p_mvs(MpegEncContext * s) +{ + const int f_code= s->f_code; + int y; + UINT8 * fcode_tab= s->fcode_tab; +//int clip=0; +//int noclip=0; + /* clip / convert to intra 16x16 type MVs */ + for(y=0; y<s->mb_height; y++){ + int x; + int xy= (y+1)* (s->mb_width+2)+1; + int i= y*s->mb_width; + for(x=0; x<s->mb_width; x++){ + if(s->mb_type[i]&MB_TYPE_INTER){ + if( fcode_tab[s->p_mv_table[xy][0] + MAX_MV] > f_code + || fcode_tab[s->p_mv_table[xy][0] + MAX_MV] == 0 + || fcode_tab[s->p_mv_table[xy][1] + MAX_MV] > f_code + || fcode_tab[s->p_mv_table[xy][1] + MAX_MV] == 0 ){ + s->mb_type[i] &= ~MB_TYPE_INTER; + s->mb_type[i] |= MB_TYPE_INTRA; + s->p_mv_table[xy][0] = 0; + s->p_mv_table[xy][1] = 0; +//clip++; + } +//else +// noclip++; + } + xy++; + i++; + } + } +//printf("%d no:%d %d//\n", clip, noclip, f_code); + if(s->flags&CODEC_FLAG_4MV){ + const int wrap= 2+ s->mb_width*2; + + /* clip / convert to intra 8x8 type MVs */ + for(y=0; y<s->mb_height; y++){ + int xy= (y*2 + 1)*wrap + 1; + int i= y*s->mb_width; + int x; + + for(x=0; x<s->mb_width; x++){ + if(s->mb_type[i]&MB_TYPE_INTER4V){ + int block; + for(block=0; block<4; block++){ + int off= (block& 1) + (block>>1)*wrap; + int mx= s->motion_val[ xy + off ][0]; + int my= s->motion_val[ xy + off ][1]; 
+ + if( fcode_tab[mx + MAX_MV] > f_code + || fcode_tab[mx + MAX_MV] == 0 + || fcode_tab[my + MAX_MV] > f_code + || fcode_tab[my + MAX_MV] == 0 ){ + s->mb_type[i] &= ~MB_TYPE_INTER4V; + s->mb_type[i] |= MB_TYPE_INTRA; + } + } + xy+=2; + i++; + } + } + } + } +} + +void ff_fix_long_b_mvs(MpegEncContext * s, int16_t (*mv_table)[2], int f_code, int type) +{ + int y; + UINT8 * fcode_tab= s->fcode_tab; + + /* clip / convert to intra 16x16 type MVs */ + for(y=0; y<s->mb_height; y++){ + int x; + int xy= (y+1)* (s->mb_width+2)+1; + int i= y*s->mb_width; + for(x=0; x<s->mb_width; x++){ + if(s->mb_type[i]&type){ + if( fcode_tab[mv_table[xy][0] + MAX_MV] > f_code + || fcode_tab[mv_table[xy][0] + MAX_MV] == 0 + || fcode_tab[mv_table[xy][1] + MAX_MV] > f_code + || fcode_tab[mv_table[xy][1] + MAX_MV] == 0 ){ + if(s->mb_type[i]&(~type)) s->mb_type[i] &= ~type; + else{ + mv_table[xy][0] = 0; + mv_table[xy][1] = 0; + //this is certainly bad FIXME + } + } + } + xy++; + i++; + } + } +} diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c index ac614d5ce..37e9b70ac 100644 --- a/src/libffmpeg/libavcodec/mpeg12.c +++ b/src/libffmpeg/libavcodec/mpeg12.c @@ -1,26 +1,25 @@ /* * MPEG1 encoder / MPEG2 decoder - * Copyright (c) 2000,2001 Gerard Lantau. + * Copyright (c) 2000,2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
* - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ //#define DEBUG #include "avcodec.h" #include "dsputil.h" #include "mpegvideo.h" -#include "xineutils.h" #include "mpeg12data.h" @@ -34,8 +33,6 @@ #define EXT_START_CODE 0x000001b5 #define USER_START_CODE 0x000001b2 -#define ABS(a) ((a)<0 ? 
-(a) : (a)) - static void mpeg1_encode_block(MpegEncContext *s, DCTELEM *block, int component); @@ -400,8 +397,11 @@ void mpeg1_encode_init(MpegEncContext *s) } } s->mv_penalty= mv_penalty; - s->fcode_tab= fcode_tab; + s->min_qcoeff=-255; + s->max_qcoeff= 255; + s->intra_quant_bias= 3<<(QUANT_BIAS_SHIFT-3); //(a + x*3/8)/x + s->inter_quant_bias= 0; } static inline void encode_dc(MpegEncContext *s, int diff, int component) @@ -853,6 +853,8 @@ static int mpeg_decode_mb(MpegEncContext *s, if (cbp & (1 << (5 - i))) { if (mpeg2_decode_block_intra(s, block[i], i) < 0) return -1; + } else { + s->block_last_index[i] = -1; } } } else { @@ -860,6 +862,8 @@ static int mpeg_decode_mb(MpegEncContext *s, if (cbp & (1 << (5 - i))) { if (mpeg2_decode_block_non_intra(s, block[i], i) < 0) return -1; + } else { + s->block_last_index[i] = -1; } } } @@ -868,6 +872,8 @@ static int mpeg_decode_mb(MpegEncContext *s, if (cbp & (1 << (5 - i))) { if (mpeg1_decode_block(s, block[i], i) < 0) return -1; + } else { + s->block_last_index[i] = -1; } } } @@ -1028,9 +1034,9 @@ static int mpeg2_decode_block_non_intra(MpegEncContext *s, UINT8 *buf_ptr; i = 0; if (n < 4) - matrix = s->non_intra_matrix; + matrix = s->inter_matrix; else - matrix = s->chroma_non_intra_matrix; + matrix = s->chroma_inter_matrix; /* special case for the first coef. 
no need to add a second vlc table */ SAVE_BITS(&s->gb); @@ -1184,6 +1190,9 @@ static int mpeg_decode_init(AVCodecContext *avctx) s->buf_ptr = s->buffer; s->mpeg_enc_ctx.picture_number = 0; s->repeat_field = 0; + s->mpeg_enc_ctx.codec_id= avctx->codec->id; + avctx->mbskip_table= s->mpeg_enc_ctx.mbskip_table; + s->mpeg_enc_ctx.flags= avctx->flags; return 0; } @@ -1273,6 +1282,7 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s) s->frame_rate = (s->frame_rate * frame_rate_ext_n) / frame_rate_ext_d; dprintf("sequence extension\n"); s->mpeg2 = 1; + s->avctx->sub_id = 2; /* indicates mpeg2 found */ } static void mpeg_decode_quant_matrix_extension(MpegEncContext *s) @@ -1293,8 +1303,8 @@ static void mpeg_decode_quant_matrix_extension(MpegEncContext *s) for(i=0;i<64;i++) { v = get_bits(&s->gb, 8); j = zigzag_direct[i]; - s->non_intra_matrix[j] = v; - s->chroma_non_intra_matrix[j] = v; + s->inter_matrix[j] = v; + s->chroma_inter_matrix[j] = v; } } if (get_bits1(&s->gb)) { @@ -1308,7 +1318,7 @@ static void mpeg_decode_quant_matrix_extension(MpegEncContext *s) for(i=0;i<64;i++) { v = get_bits(&s->gb, 8); j = zigzag_direct[i]; - s->chroma_non_intra_matrix[j] = v; + s->chroma_inter_matrix[j] = v; } } } @@ -1334,6 +1344,8 @@ static void mpeg_decode_picture_coding_extension(MpegEncContext *s) /* composite display not parsed */ dprintf("intra_dc_precision=%d\n", s->intra_dc_precision); dprintf("picture_structure=%d\n", s->picture_structure); + dprintf("top field first=%d\n", s->top_field_first); + dprintf("repeat first field=%d\n", s->repeat_first_field); dprintf("conceal=%d\n", s->concealment_motion_vectors); dprintf("intra_vlc_format=%d\n", s->intra_vlc_format); dprintf("alternate_scan=%d\n", s->alternate_scan); @@ -1387,7 +1399,6 @@ static int mpeg_decode_slice(AVCodecContext *avctx, s->mb_x = -1; s->mb_y = start_code; s->mb_incr = 0; - /* start frame decoding */ if (s->first_slice) { s->first_slice = 0; @@ -1404,6 +1415,7 @@ static int 
mpeg_decode_slice(AVCodecContext *avctx, for(;;) { clear_blocks(s->block[0]); + emms_c(); ret = mpeg_decode_mb(s, s->block); dprintf("ret=%d\n", ret); if (ret < 0) @@ -1460,7 +1472,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, Mpeg1Context *s1 = avctx->priv_data; MpegEncContext *s = &s1->mpeg_enc_ctx; int width, height, i, v, j; - + init_get_bits(&s->gb, buf, buf_size); width = get_bits(&s->gb, 12); @@ -1488,7 +1500,12 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, s->avctx = avctx; avctx->width = width; avctx->height = height; - avctx->frame_rate = frame_rate_tab[s->frame_rate_index]; + if (s->frame_rate_index >= 9) { + /* at least give a valid frame rate (some old mpeg1 have this) */ + avctx->frame_rate = 25 * FRAME_RATE_BASE; + } else { + avctx->frame_rate = frame_rate_tab[s->frame_rate_index]; + } s->frame_rate = avctx->frame_rate; avctx->bit_rate = s->bit_rate; @@ -1526,20 +1543,20 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, for(i=0;i<64;i++) { v = get_bits(&s->gb, 8); j = zigzag_direct[i]; - s->non_intra_matrix[j] = v; - s->chroma_non_intra_matrix[j] = v; + s->inter_matrix[j] = v; + s->chroma_inter_matrix[j] = v; } #ifdef DEBUG dprintf("non intra matrix present\n"); for(i=0;i<64;i++) - dprintf(" %d", s->non_intra_matrix[zigzag_direct[i]]); + dprintf(" %d", s->inter_matrix[zigzag_direct[i]]); printf("\n"); #endif } else { for(i=0;i<64;i++) { v = default_non_intra_matrix[i]; - s->non_intra_matrix[i] = v; - s->chroma_non_intra_matrix[i] = v; + s->inter_matrix[i] = v; + s->chroma_inter_matrix[i] = v; } } @@ -1549,6 +1566,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, s->picture_structure = PICT_FRAME; s->frame_pred_frame_dct = 1; s->mpeg2 = 0; + avctx->sub_id = 1; /* indicates mpeg1 */ return 0; } @@ -1566,7 +1584,7 @@ static int mpeg_decode_frame(AVCodecContext *avctx, dprintf("fill_buffer\n"); *data_size = 0; - + /* special case for last picture */ if (buf_size == 0) { if (s2->picture_number > 0) { @@ 
-1583,15 +1601,18 @@ static int mpeg_decode_frame(AVCodecContext *avctx, buf_ptr = buf; buf_end = buf + buf_size; - - if (s->repeat_field % 2 == 1) { + +#if 0 + if (s->repeat_field % 2 == 1) { s->repeat_field++; //fprintf(stderr,"\nRepeating last frame: %d -> %d! pict: %d %d", avctx->frame_number-1, avctx->frame_number, - // s2->picture_number, s->repeat_field); - *data_size = sizeof(AVPicture); - goto the_end; + // s2->picture_number, s->repeat_field); + if (avctx->flags & CODEC_FLAG_REPEAT_FIELD) { + *data_size = sizeof(AVPicture); + goto the_end; + } } - +#endif while (buf_ptr < buf_end) { buf_start = buf_ptr; /* find start next code */ @@ -1641,13 +1662,27 @@ static int mpeg_decode_frame(AVCodecContext *avctx, if (ret == 1) { /* got a picture: exit */ /* first check if we must repeat the frame */ + avctx->repeat_pict = 0; +#if 0 if (s2->progressive_frame && s2->repeat_first_field) { //fprintf(stderr,"\nRepeat this frame: %d! pict: %d",avctx->frame_number,s2->picture_number); - s2->repeat_first_field = 0; - s2->progressive_frame = 0; + //s2->repeat_first_field = 0; + //s2->progressive_frame = 0; if (++s->repeat_field > 2) s->repeat_field = 0; + avctx->repeat_pict = 1; } +#endif + if (s2->repeat_first_field) { + if (s2->progressive_sequence) { + if (s2->top_field_first) + avctx->repeat_pict = 4; + else + avctx->repeat_pict = 2; + } else if (s2->progressive_frame) { + avctx->repeat_pict = 1; + } + } *data_size = sizeof(AVPicture); goto the_end; } diff --git a/src/libffmpeg/libavcodec/mpeg4data.h b/src/libffmpeg/libavcodec/mpeg4data.h index 91b99625f..e972a7576 100644 --- a/src/libffmpeg/libavcodec/mpeg4data.h +++ b/src/libffmpeg/libavcodec/mpeg4data.h @@ -4,13 +4,20 @@ #define BIN_ONLY_SHAPE 2 #define GRAY_SHAPE 3 +#define SIMPLE_VO_TYPE 1 +#define CORE_VO_TYPE 3 + // aspect_ratio_info -#define EXTENDET_PAR 15 +#define EXTENDED_PAR 15 //vol_sprite_usage / sprite_enable #define STATIC_SPRITE 1 #define GMC_SPRITE 2 +#define MOTION_MARKER 0x1F001 +#define DC_MARKER 
0x6B001 + + /* dc encoding for mpeg4 */ const UINT8 DCtab_lum[13][2] = { @@ -122,3 +129,27 @@ static const UINT16 pixel_aspect[16][2]={ {0, 0}, {0, 0}, }; + +/* these matrixes will be permuted for the idct */ +INT16 ff_mpeg4_default_intra_matrix[64] = { + 8, 17, 18, 19, 21, 23, 25, 27, + 17, 18, 19, 21, 23, 25, 27, 28, + 20, 21, 22, 23, 24, 26, 28, 30, + 21, 22, 23, 24, 26, 28, 30, 32, + 22, 23, 24, 26, 28, 30, 32, 35, + 23, 24, 26, 28, 30, 32, 35, 38, + 25, 26, 28, 30, 32, 35, 38, 41, + 27, 28, 30, 32, 35, 38, 41, 45, +}; + +INT16 ff_mpeg4_default_non_intra_matrix[64] = { + 16, 17, 18, 19, 20, 21, 22, 23, + 17, 18, 19, 20, 21, 22, 23, 24, + 18, 19, 20, 21, 22, 23, 24, 25, + 19, 20, 21, 22, 23, 24, 26, 27, + 20, 21, 22, 23, 25, 26, 27, 28, + 21, 22, 23, 24, 26, 27, 28, 30, + 22, 23, 24, 26, 27, 28, 30, 31, + 23, 24, 25, 27, 28, 30, 31, 33, +}; + diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c index 9f572c3d9..63242c9de 100644 --- a/src/libffmpeg/libavcodec/mpegvideo.c +++ b/src/libffmpeg/libavcodec/mpegvideo.c @@ -1,49 +1,42 @@ /* * The simplest mpeg encoder (well, it was the simplest!) - * Copyright (c) 2000,2001 Gerard Lantau. + * Copyright (c) 2000,2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
* - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * 4MV & hq encoding stuff by Michael Niedermayer <michaelni@gmx.at> + * 4MV & hq & b-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at> */ -#include <stdlib.h> -#include <stdio.h> -#include <math.h> -#include <string.h> #include "avcodec.h" #include "dsputil.h" #include "mpegvideo.h" -#include "config.h" -#include "xine-utils/xineutils.h" - #ifdef USE_FASTMEMCPY #include "fastmemcpy.h" #endif static void encode_picture(MpegEncContext *s, int picture_number); -static void rate_control_init(MpegEncContext *s); -static int rate_estimate_qscale(MpegEncContext *s); static void dct_unquantize_mpeg1_c(MpegEncContext *s, DCTELEM *block, int n, int qscale); +static void dct_unquantize_mpeg2_c(MpegEncContext *s, + DCTELEM *block, int n, int qscale); static void dct_unquantize_h263_c(MpegEncContext *s, DCTELEM *block, int n, int qscale); static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w); -static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale); +static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow); -int 
(*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale)= dct_quantize_c; +int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow)= dct_quantize_c; void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w)= draw_edges_c; #define EDGE_WIDTH 16 @@ -53,6 +46,7 @@ void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w)= draw_edg //#define DEBUG + /* for jpeg fast DCT */ #define CONST_BITS 14 @@ -75,37 +69,55 @@ static UINT8 h263_chroma_roundtab[16] = { static UINT16 default_mv_penalty[MAX_FCODE+1][MAX_MV*2+1]; static UINT8 default_fcode_tab[MAX_MV*2+1]; -/* default motion estimation */ -int motion_estimation_method = ME_LOG; - extern UINT8 zigzag_end[64]; -static void convert_matrix(int *qmat, UINT16 *qmat16, const UINT16 *quant_matrix, int qscale) +/* default motion estimation */ +int motion_estimation_method = ME_EPZS; + +static void convert_matrix(int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*qmat16_bias)[64], + const UINT16 *quant_matrix, int bias) { - int i; + int qscale; - if (av_fdct == jpeg_fdct_ifast) { - for(i=0;i<64;i++) { - /* 16 <= qscale * quant_matrix[i] <= 7905 */ - /* 19952 <= aanscales[i] * qscale * quant_matrix[i] <= 249205026 */ - /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */ - /* 3444240 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */ - - qmat[block_permute_op(i)] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / - (aanscales[i] * qscale * quant_matrix[block_permute_op(i)])); - } - } else { - for(i=0;i<64;i++) { - /* We can safely suppose that 16 <= quant_matrix[i] <= 255 - So 16 <= qscale * quant_matrix[i] <= 7905 - so (1<<19) / 16 >= (1<<19) / (qscale * quant_matrix[i]) >= (1<<19) / 7905 - so 32768 >= (1<<19) / (qscale * quant_matrix[i]) >= 67 - */ - qmat[i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]); - qmat16[i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[block_permute_op(i)]); + for(qscale=1; 
qscale<32; qscale++){ + int i; + if (av_fdct == fdct_ifast) { + for(i=0;i<64;i++) { + const int j= block_permute_op(i); + /* 16 <= qscale * quant_matrix[i] <= 7905 */ + /* 19952 <= aanscales[i] * qscale * quant_matrix[i] <= 249205026 */ + /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */ + /* 3444240 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */ + + qmat[qscale][j] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / + (aanscales[i] * qscale * quant_matrix[j])); + } + } else { + for(i=0;i<64;i++) { + /* We can safely suppose that 16 <= quant_matrix[i] <= 255 + So 16 <= qscale * quant_matrix[i] <= 7905 + so (1<<19) / 16 >= (1<<19) / (qscale * quant_matrix[i]) >= (1<<19) / 7905 + so 32768 >= (1<<19) / (qscale * quant_matrix[i]) >= 67 + */ + qmat [qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]); + qmat16[qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[block_permute_op(i)]); + + if(qmat16[qscale][i]==0 || qmat16[qscale][i]==128*256) qmat16[qscale][i]=128*256-1; + + qmat16_bias[qscale][i]= ROUNDED_DIV(bias<<(16-QUANT_BIAS_SHIFT), qmat16[qscale][i]); + } } } } +// move into common.c perhaps +#define CHECKED_ALLOCZ(p, size)\ +{\ + p= av_mallocz(size);\ + if(p==NULL){\ + perror("malloc");\ + goto fail;\ + }\ +} /* init common structure for both encoder and decoder */ int MPV_common_init(MpegEncContext *s) @@ -113,14 +125,19 @@ int MPV_common_init(MpegEncContext *s) int c_size, i; UINT8 *pict; - if (s->out_format == FMT_H263) - s->dct_unquantize = dct_unquantize_h263_c; - else - s->dct_unquantize = dct_unquantize_mpeg1_c; + s->dct_unquantize_h263 = dct_unquantize_h263_c; + s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_c; + s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_c; #ifdef HAVE_MMX MPV_common_init_mmx(s); #endif + //setup default unquantizers (mpeg4 might change it later) + if(s->out_format == FMT_H263) + s->dct_unquantize = s->dct_unquantize_h263; + else + s->dct_unquantize = 
s->dct_unquantize_mpeg1; + s->mb_width = (s->width + 15) / 16; s->mb_height = (s->height + 15) / 16; s->mb_num = s->mb_width * s->mb_height; @@ -135,58 +152,79 @@ int MPV_common_init(MpegEncContext *s) c_size = (w >> shift) * (h >> shift); pict_start = (w >> shift) * (EDGE_WIDTH >> shift) + (EDGE_WIDTH >> shift); - pict = av_mallocz(c_size); - if (pict == NULL) - goto fail; + CHECKED_ALLOCZ(pict, c_size) s->last_picture_base[i] = pict; s->last_picture[i] = pict + pict_start; + if(i>0) memset(s->last_picture_base[i], 128, c_size); - pict = av_mallocz(c_size); - if (pict == NULL) - goto fail; + CHECKED_ALLOCZ(pict, c_size) s->next_picture_base[i] = pict; s->next_picture[i] = pict + pict_start; - - if (s->has_b_frames) { - pict = av_mallocz(c_size); - if (pict == NULL) - goto fail; + if(i>0) memset(s->next_picture_base[i], 128, c_size); + + if (s->has_b_frames || s->codec_id==CODEC_ID_MPEG4) { + /* Note the MPEG4 stuff is here cuz of buggy encoders which dont set the low_delay flag but + do low-delay encoding, so we cant allways distinguish b-frame containing streams from low_delay streams */ + CHECKED_ALLOCZ(pict, c_size) s->aux_picture_base[i] = pict; s->aux_picture[i] = pict + pict_start; + if(i>0) memset(s->aux_picture_base[i], 128, c_size); } } if (s->encoding) { - /* Allocate MB type table */ - s->mb_type = av_mallocz(s->mb_num * sizeof(char)); - if (s->mb_type == NULL) { - perror("malloc"); - goto fail; - } + int j; + int mv_table_size= (s->mb_width+2)*(s->mb_height+2); + + CHECKED_ALLOCZ(s->mb_var , s->mb_num * sizeof(INT16)) + CHECKED_ALLOCZ(s->mc_mb_var, s->mb_num * sizeof(INT16)) + + /* Allocate MV tables */ + CHECKED_ALLOCZ(s->p_mv_table , mv_table_size * 2 * sizeof(INT16)) + CHECKED_ALLOCZ(s->b_forw_mv_table , mv_table_size * 2 * sizeof(INT16)) + CHECKED_ALLOCZ(s->b_back_mv_table , mv_table_size * 2 * sizeof(INT16)) + CHECKED_ALLOCZ(s->b_bidir_forw_mv_table , mv_table_size * 2 * sizeof(INT16)) + CHECKED_ALLOCZ(s->b_bidir_back_mv_table , mv_table_size * 2 
* sizeof(INT16)) + CHECKED_ALLOCZ(s->b_direct_forw_mv_table, mv_table_size * 2 * sizeof(INT16)) + CHECKED_ALLOCZ(s->b_direct_back_mv_table, mv_table_size * 2 * sizeof(INT16)) + CHECKED_ALLOCZ(s->b_direct_mv_table , mv_table_size * 2 * sizeof(INT16)) + + CHECKED_ALLOCZ(s->me_scratchpad, s->linesize*16*3*sizeof(uint8_t)) - s->mb_var = av_mallocz(s->mb_num * sizeof(INT16)); - if (s->mb_var == NULL) { - perror("malloc"); - goto fail; + CHECKED_ALLOCZ(s->me_map , ME_MAP_SIZE*sizeof(uint32_t)) + CHECKED_ALLOCZ(s->me_score_map, ME_MAP_SIZE*sizeof(uint16_t)) + + if(s->max_b_frames){ + for(j=0; j<REORDER_BUFFER_SIZE; j++){ + int i; + for(i=0;i<3;i++) { + int w, h, shift; + + w = s->linesize; + h = s->mb_height * 16; + shift = (i == 0) ? 0 : 1; + c_size = (w >> shift) * (h >> shift); + + CHECKED_ALLOCZ(pict, c_size); + s->picture_buffer[j][i] = pict; + } + } } - /* Allocate MV table */ - /* By now we just have one MV per MB */ - s->mv_table[0] = av_mallocz(s->mb_num * sizeof(INT16)); - s->mv_table[1] = av_mallocz(s->mb_num * sizeof(INT16)); - if (s->mv_table[1] == NULL || s->mv_table[0] == NULL) { - perror("malloc"); - goto fail; + + if(s->codec_id==CODEC_ID_MPEG4){ + CHECKED_ALLOCZ(s->tex_pb_buffer, PB_BUFFER_SIZE); + CHECKED_ALLOCZ( s->pb2_buffer, PB_BUFFER_SIZE); } } if (s->out_format == FMT_H263 || s->encoding) { int size; + /* Allocate MB type table */ + CHECKED_ALLOCZ(s->mb_type , s->mb_num * sizeof(UINT8)) + /* MV prediction */ size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2); - s->motion_val = malloc(size * 2 * sizeof(INT16)); - if (s->motion_val == NULL) - goto fail; - memset(s->motion_val, 0, size * 2 * sizeof(INT16)); + CHECKED_ALLOCZ(s->motion_val, size * 2 * sizeof(INT16)); } if (s->h263_pred || s->h263_plus) { @@ -197,43 +235,40 @@ int MPV_common_init(MpegEncContext *s) y_size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2); c_size = (s->mb_width + 2) * (s->mb_height + 2); size = y_size + 2 * c_size; - s->dc_val[0] = malloc(size * sizeof(INT16)); - if 
(s->dc_val[0] == NULL) - goto fail; + CHECKED_ALLOCZ(s->dc_val[0], size * sizeof(INT16)); s->dc_val[1] = s->dc_val[0] + y_size; s->dc_val[2] = s->dc_val[1] + c_size; for(i=0;i<size;i++) s->dc_val[0][i] = 1024; /* ac values */ - s->ac_val[0] = av_mallocz(size * sizeof(INT16) * 16); - if (s->ac_val[0] == NULL) - goto fail; + CHECKED_ALLOCZ(s->ac_val[0], size * sizeof(INT16) * 16); s->ac_val[1] = s->ac_val[0] + y_size; s->ac_val[2] = s->ac_val[1] + c_size; /* cbp values */ - s->coded_block = av_mallocz(y_size); - if (!s->coded_block) - goto fail; + CHECKED_ALLOCZ(s->coded_block, y_size); /* which mb is a intra block */ - s->mbintra_table = av_mallocz(s->mb_num); - if (!s->mbintra_table) - goto fail; + CHECKED_ALLOCZ(s->mbintra_table, s->mb_num); memset(s->mbintra_table, 1, s->mb_num); + + /* divx501 bitstream reorder buffer */ + CHECKED_ALLOCZ(s->bitstream_buffer, BITSTREAM_BUFFER_SIZE); + + /* cbp, ac_pred, pred_dir */ + CHECKED_ALLOCZ(s->cbp_table , s->mb_num * sizeof(UINT8)) + CHECKED_ALLOCZ(s->pred_dir_table, s->mb_num * sizeof(UINT8)) + + CHECKED_ALLOCZ(s->qscale_table , s->mb_num * sizeof(UINT8)) } /* default structure is frame */ s->picture_structure = PICT_FRAME; /* init macroblock skip table */ - if (!s->encoding) { - s->mbskip_table = av_mallocz(s->mb_num); - if (!s->mbskip_table) - goto fail; - } + CHECKED_ALLOCZ(s->mbskip_table, s->mb_num); - s->block= s->intra_block; + s->block= s->blocks[0]; s->context_initialized = 1; return 0; @@ -242,39 +277,49 @@ int MPV_common_init(MpegEncContext *s) return -1; } + +//extern int sads; + /* init common structure for both encoder and decoder */ void MPV_common_end(MpegEncContext *s) { int i; - if (s->mb_type) - free(s->mb_type); - if (s->mb_var) - free(s->mb_var); - if (s->mv_table[0]) - free(s->mv_table[0]); - if (s->mv_table[1]) - free(s->mv_table[1]); - if (s->motion_val) - free(s->motion_val); - if (s->dc_val[0]) - free(s->dc_val[0]); - if (s->ac_val[0]) - free(s->ac_val[0]); - if (s->coded_block) - 
free(s->coded_block); - if (s->mbintra_table) - free(s->mbintra_table); - - if (s->mbskip_table) - free(s->mbskip_table); + av_freep(&s->mb_type); + av_freep(&s->mb_var); + av_freep(&s->mc_mb_var); + av_freep(&s->p_mv_table); + av_freep(&s->b_forw_mv_table); + av_freep(&s->b_back_mv_table); + av_freep(&s->b_bidir_forw_mv_table); + av_freep(&s->b_bidir_back_mv_table); + av_freep(&s->b_direct_forw_mv_table); + av_freep(&s->b_direct_back_mv_table); + av_freep(&s->b_direct_mv_table); + av_freep(&s->motion_val); + av_freep(&s->dc_val[0]); + av_freep(&s->ac_val[0]); + av_freep(&s->coded_block); + av_freep(&s->mbintra_table); + av_freep(&s->cbp_table); + av_freep(&s->pred_dir_table); + av_freep(&s->qscale_table); + av_freep(&s->me_scratchpad); + av_freep(&s->me_map); + av_freep(&s->me_score_map); + + av_freep(&s->mbskip_table); + av_freep(&s->bitstream_buffer); + av_freep(&s->tex_pb_buffer); + av_freep(&s->pb2_buffer); for(i=0;i<3;i++) { - if (s->last_picture_base[i]) - free(s->last_picture_base[i]); - if (s->next_picture_base[i]) - free(s->next_picture_base[i]); - if (s->has_b_frames) - free(s->aux_picture_base[i]); + int j; + av_freep(&s->last_picture_base[i]); + av_freep(&s->next_picture_base[i]); + av_freep(&s->aux_picture_base[i]); + for(j=0; j<REORDER_BUFFER_SIZE; j++){ + av_freep(&s->picture_buffer[j][i]); + } } s->context_initialized = 0; } @@ -285,11 +330,17 @@ int MPV_encode_init(AVCodecContext *avctx) MpegEncContext *s = avctx->priv_data; int i; + avctx->pix_fmt = PIX_FMT_YUV420P; + s->bit_rate = avctx->bit_rate; s->bit_rate_tolerance = avctx->bit_rate_tolerance; s->frame_rate = avctx->frame_rate; s->width = avctx->width; s->height = avctx->height; + if(avctx->gop_size > 600){ + fprintf(stderr, "Warning keyframe interval too large! 
reducing it ...\n"); + avctx->gop_size=600; + } s->gop_size = avctx->gop_size; s->rtp_mode = avctx->rtp_mode; s->rtp_payload_size = avctx->rtp_payload_size; @@ -300,36 +351,56 @@ int MPV_encode_init(AVCodecContext *avctx) s->max_qdiff= avctx->max_qdiff; s->qcompress= avctx->qcompress; s->qblur= avctx->qblur; + s->b_quant_factor= avctx->b_quant_factor; + s->b_quant_offset= avctx->b_quant_offset; s->avctx = avctx; s->aspect_ratio_info= avctx->aspect_ratio_info; s->flags= avctx->flags; - + s->max_b_frames= avctx->max_b_frames; + s->rc_strategy= avctx->rc_strategy; + s->b_frame_strategy= avctx->b_frame_strategy; + s->codec_id= avctx->codec->id; + s->luma_elim_threshold = avctx->luma_elim_threshold; + s->chroma_elim_threshold= avctx->chroma_elim_threshold; + s->strict_std_compliance= avctx->strict_std_compliance; + s->data_partitioning= avctx->flags & CODEC_FLAG_PART; + if (s->gop_size <= 1) { s->intra_only = 1; s->gop_size = 12; } else { s->intra_only = 0; } - s->full_search = motion_estimation_method; - + + /* ME algorithm */ + if (avctx->me_method == 0) + /* For compatibility */ + s->me_method = motion_estimation_method; + else + s->me_method = avctx->me_method; + + /* Fixed QSCALE */ s->fixed_qscale = (avctx->flags & CODEC_FLAG_QSCALE); switch(avctx->codec->id) { case CODEC_ID_MPEG1VIDEO: s->out_format = FMT_MPEG1; + avctx->delay=0; //FIXME not sure, should check the spec break; case CODEC_ID_MJPEG: s->out_format = FMT_MJPEG; s->intra_only = 1; /* force intra only for jpeg */ s->mjpeg_write_tables = 1; /* write all tables */ + s->mjpeg_data_only_frames = 0; /* write all the needed headers */ s->mjpeg_vsample[0] = 2; /* set up default sampling factors */ s->mjpeg_vsample[1] = 1; /* the only currently supported values */ s->mjpeg_vsample[2] = 1; - s->mjpeg_hsample[0] = 2; + s->mjpeg_hsample[0] = 2; s->mjpeg_hsample[1] = 1; s->mjpeg_hsample[2] = 1; if (mjpeg_init(s) < 0) return -1; + avctx->delay=0; break; case CODEC_ID_H263: if (h263_get_picture_format(s->width, 
s->height) == 7) { @@ -337,6 +408,7 @@ int MPV_encode_init(AVCodecContext *avctx) return -1; } s->out_format = FMT_H263; + avctx->delay=0; break; case CODEC_ID_H263P: s->out_format = FMT_H263; @@ -344,19 +416,25 @@ int MPV_encode_init(AVCodecContext *avctx) s->rtp_payload_size = 1200; s->h263_plus = 1; s->unrestricted_mv = 1; + s->h263_aic = 1; /* These are just to be sure */ s->umvplus = 0; s->umvplus_dec = 0; + avctx->delay=0; break; case CODEC_ID_RV10: s->out_format = FMT_H263; s->h263_rv10 = 1; + avctx->delay=0; break; case CODEC_ID_MPEG4: s->out_format = FMT_H263; s->h263_pred = 1; s->unrestricted_mv = 1; + s->has_b_frames= s->max_b_frames ? 1 : 0; + s->low_delay=0; + avctx->delay= s->low_delay ? 0 : (s->max_b_frames + 1); break; case CODEC_ID_MSMPEG4V1: s->out_format = FMT_H263; @@ -364,6 +442,7 @@ int MPV_encode_init(AVCodecContext *avctx) s->h263_pred = 1; s->unrestricted_mv = 1; s->msmpeg4_version= 1; + avctx->delay=0; break; case CODEC_ID_MSMPEG4V2: s->out_format = FMT_H263; @@ -371,6 +450,7 @@ int MPV_encode_init(AVCodecContext *avctx) s->h263_pred = 1; s->unrestricted_mv = 1; s->msmpeg4_version= 2; + avctx->delay=0; break; case CODEC_ID_MSMPEG4V3: s->out_format = FMT_H263; @@ -378,16 +458,12 @@ int MPV_encode_init(AVCodecContext *avctx) s->h263_pred = 1; s->unrestricted_mv = 1; s->msmpeg4_version= 3; + avctx->delay=0; break; default: return -1; } - if((s->flags&CODEC_FLAG_4MV) && !(s->flags&CODEC_FLAG_HQ)){ - printf("4MV is currently only supported in HQ mode\n"); - return -1; - } - { /* set up some save defaults, some codecs might override them later */ static int done=0; if(!done){ @@ -410,7 +486,7 @@ int MPV_encode_init(AVCodecContext *avctx) mpeg1_encode_init(s); /* dont use mv_penalty table for crap MV as it would be confused */ - if(s->full_search<4) s->mv_penalty= default_mv_penalty; + if (s->me_method < ME_EPZS) s->mv_penalty = default_mv_penalty; s->encoding = 1; @@ -420,18 +496,32 @@ int MPV_encode_init(AVCodecContext *avctx) /* init default q 
matrix */ for(i=0;i<64;i++) { - s->intra_matrix[i] = default_intra_matrix[i]; - s->non_intra_matrix[i] = default_non_intra_matrix[i]; + if(s->out_format == FMT_H263) + s->intra_matrix[i] = default_non_intra_matrix[i]; + else + s->intra_matrix[i] = default_intra_matrix[i]; + + s->inter_matrix[i] = default_non_intra_matrix[i]; } - /* rate control init */ - rate_control_init(s); + /* precompute matrix */ + /* for mjpeg, we do include qscale in the matrix */ + if (s->out_format != FMT_MJPEG) { + convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->q_intra_matrix16_bias, + s->intra_matrix, s->intra_quant_bias); + convert_matrix(s->q_inter_matrix, s->q_inter_matrix16, s->q_inter_matrix16_bias, + s->inter_matrix, s->inter_quant_bias); + } + + if(ff_rate_control_init(s) < 0) + return -1; s->picture_number = 0; s->picture_in_gop_number = 0; s->fake_picture_number = 0; /* motion detector init */ s->f_code = 1; + s->b_code = 1; return 0; } @@ -443,6 +533,9 @@ int MPV_encode_end(AVCodecContext *avctx) #ifdef STATS print_stats(); #endif + + ff_rate_control_uninit(s); + MPV_common_end(s); if (s->out_format == FMT_MJPEG) mjpeg_close(s); @@ -485,12 +578,13 @@ void MPV_frame_start(MpegEncContext *s) UINT8 *tmp; s->mb_skiped = 0; + s->decoding_error=0; + if (s->pict_type == B_TYPE) { for(i=0;i<3;i++) { s->current_picture[i] = s->aux_picture[i]; } } else { - s->last_non_b_pict_type= s->pict_type; for(i=0;i<3;i++) { /* swap next and last */ tmp = s->last_picture[i]; @@ -504,9 +598,11 @@ void MPV_frame_start(MpegEncContext *s) /* generic function for encode/decode called after a frame has been coded/decoded */ void MPV_frame_end(MpegEncContext *s) { +// if((s->picture_number%100)==0 && s->encoding) printf("sads:%d //\n", sads); + /* draw edge for correct motion prediction if outside */ if (s->pict_type != B_TYPE && !s->intra_only) { - if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4 || s->divx_version==500){ + if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4 || 
s->divx_version>=500){ draw_edges(s->current_picture[0], s->linesize, s->mb_width*16, s->mb_height*16, EDGE_WIDTH); draw_edges(s->current_picture[1], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2); draw_edges(s->current_picture[2], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2); @@ -518,107 +614,195 @@ void MPV_frame_end(MpegEncContext *s) } } emms_c(); + + if(s->pict_type!=B_TYPE){ + s->last_non_b_pict_type= s->pict_type; + s->last_non_b_qscale= s->qscale; + s->last_non_b_mc_mb_var= s->mc_mb_var_sum; + s->num_available_buffers++; + if(s->num_available_buffers>2) s->num_available_buffers= 2; + } } -int MPV_encode_picture(AVCodecContext *avctx, - unsigned char *buf, int buf_size, void *data) +/* reorder input for encoding */ +void reorder_input(MpegEncContext *s, AVPicture *pict) { - MpegEncContext *s = avctx->priv_data; - AVPicture *pict = data; - int i, j; + int i, j, index; + + if(s->max_b_frames > FF_MAX_B_FRAMES) s->max_b_frames= FF_MAX_B_FRAMES; - if (s->fixed_qscale) - s->qscale = avctx->quality; +// delay= s->max_b_frames+1; (or 0 if no b frames cuz decoder diff) - init_put_bits(&s->pb, buf, buf_size, NULL, NULL); - - s->force_type= (avctx->flags&CODEC_FLAG_TYPE) ? - (avctx->key_frame ? 
I_TYPE : P_TYPE) : 0; - if (!s->intra_only) { - /* first picture of GOP is intra */ - if (s->picture_in_gop_number % s->gop_size==0 || s->force_type==I_TYPE){ - s->picture_in_gop_number=0; - s->pict_type = I_TYPE; - }else - s->pict_type = P_TYPE; - } else { - s->pict_type = I_TYPE; + for(j=0; j<REORDER_BUFFER_SIZE-1; j++){ + s->coded_order[j]= s->coded_order[j+1]; } - - MPV_frame_start(s); - - for(i=0;i<3;i++) { - UINT8 *src = pict->data[i]; - UINT8 *dest = s->current_picture[i]; - int src_wrap = pict->linesize[i]; - int dest_wrap = s->linesize; - int w = s->width; - int h = s->height; - - if (i >= 1) { - dest_wrap >>= 1; - w >>= 1; - h >>= 1; + s->coded_order[j].picture[0]= s->coded_order[j].picture[1]= s->coded_order[j].picture[2]= NULL; //catch uninitalized buffers + s->coded_order[j].pict_type=0; + + switch(s->input_pict_type){ + default: + case I_TYPE: + case S_TYPE: + case P_TYPE: + index= s->max_b_frames - s->b_frames_since_non_b; + s->b_frames_since_non_b=0; + break; + case B_TYPE: + index= s->max_b_frames + 1; + s->b_frames_since_non_b++; + break; + } +//printf("index:%d type:%d strides: %d %d\n", index, s->input_pict_type, pict->linesize[0], s->linesize); + if( (index==0 || (s->flags&CODEC_FLAG_INPUT_PRESERVED)) + && pict->linesize[0] == s->linesize + && pict->linesize[1] == s->linesize>>1 + && pict->linesize[2] == s->linesize>>1){ +//printf("ptr\n"); + for(i=0; i<3; i++){ + s->coded_order[index].picture[i]= pict->data[i]; } + }else{ +//printf("copy\n"); + for(i=0; i<3; i++){ + uint8_t *src = pict->data[i]; + uint8_t *dest; + int src_wrap = pict->linesize[i]; + int dest_wrap = s->linesize; + int w = s->width; + int h = s->height; + + if(index==0) dest= s->last_picture[i]+16; //is current_picture indeed but the switch hapens after reordering + else dest= s->picture_buffer[s->picture_buffer_index][i]; + + if (i >= 1) { + dest_wrap >>= 1; + w >>= 1; + h >>= 1; + } - if(dest_wrap==src_wrap){ - s->new_picture[i] = pict->data[i]; - } else { + 
s->coded_order[index].picture[i]= dest; for(j=0;j<h;j++) { memcpy(dest, src, w); dest += dest_wrap; src += src_wrap; } - s->new_picture[i] = s->current_picture[i]; - } + } + if(index!=0){ + s->picture_buffer_index++; + if(s->picture_buffer_index >= REORDER_BUFFER_SIZE-1) s->picture_buffer_index=0; + } + } + s->coded_order[index].pict_type = s->input_pict_type; + s->coded_order[index].qscale = s->input_qscale; + s->coded_order[index].force_type= s->force_input_type; + s->coded_order[index].picture_in_gop_number= s->input_picture_in_gop_number; + s->coded_order[index].picture_number= s->input_picture_number; + + for(i=0; i<3; i++){ + s->new_picture[i]= s->coded_order[0].picture[i]; } +} + +int MPV_encode_picture(AVCodecContext *avctx, + unsigned char *buf, int buf_size, void *data) +{ + MpegEncContext *s = avctx->priv_data; + AVPicture *pict = data; - encode_picture(s, s->picture_number); - avctx->key_frame = (s->pict_type == I_TYPE); - avctx->header_bits = s->header_bits; - avctx->mv_bits = s->mv_bits; - avctx->misc_bits = s->misc_bits; - avctx->i_tex_bits = s->i_tex_bits; - avctx->p_tex_bits = s->p_tex_bits; - avctx->i_count = s->i_count; - avctx->p_count = s->p_count; - avctx->skip_count = s->skip_count; + s->input_qscale = avctx->quality; - MPV_frame_end(s); - s->picture_number++; - s->picture_in_gop_number++; + init_put_bits(&s->pb, buf, buf_size, NULL, NULL); - if (s->out_format == FMT_MJPEG) - mjpeg_picture_trailer(s); + if(avctx->flags&CODEC_FLAG_TYPE){ + s->input_pict_type= + s->force_input_type= avctx->key_frame ? 
I_TYPE : P_TYPE; + }else if(s->flags&CODEC_FLAG_PASS2){ + s->input_pict_type= + s->force_input_type= s->rc_context.entry[s->input_picture_number].new_pict_type; + }else{ + s->force_input_type=0; + if (!s->intra_only) { + /* first picture of GOP is intra */ + if (s->input_picture_in_gop_number % s->gop_size==0){ + s->input_pict_type = I_TYPE; + }else if(s->max_b_frames==0){ + s->input_pict_type = P_TYPE; + }else{ + if(s->b_frames_since_non_b < s->max_b_frames) //FIXME more IQ + s->input_pict_type = B_TYPE; + else + s->input_pict_type = P_TYPE; + } + } else { + s->input_pict_type = I_TYPE; + } + } + + if(s->input_pict_type==I_TYPE) + s->input_picture_in_gop_number=0; + + reorder_input(s, pict); + + /* output? */ + if(s->coded_order[0].picture[0]){ + + s->pict_type= s->coded_order[0].pict_type; + if (s->fixed_qscale) /* the ratecontrol needs the last qscale so we dont touch it for CBR */ + s->qscale= s->coded_order[0].qscale; + s->force_type= s->coded_order[0].force_type; + s->picture_in_gop_number= s->coded_order[0].picture_in_gop_number; + s->picture_number= s->coded_order[0].picture_number; + + MPV_frame_start(s); + + encode_picture(s, s->picture_number); + avctx->key_frame = (s->pict_type == I_TYPE); + avctx->pict_type = s->pict_type; + avctx->real_pict_num = s->picture_number; + avctx->header_bits = s->header_bits; + avctx->mv_bits = s->mv_bits; + avctx->misc_bits = s->misc_bits; + avctx->i_tex_bits = s->i_tex_bits; + avctx->p_tex_bits = s->p_tex_bits; + avctx->i_count = s->i_count; + avctx->p_count = s->p_count; + avctx->skip_count = s->skip_count; + + MPV_frame_end(s); + + if (s->out_format == FMT_MJPEG) + mjpeg_picture_trailer(s); + + avctx->quality = s->qscale; + + if(s->flags&CODEC_FLAG_PASS1) + ff_write_pass1_stats(s); + + } + + s->input_picture_number++; + s->input_picture_in_gop_number++; flush_put_bits(&s->pb); - s->last_frame_bits= s->frame_bits; s->frame_bits = (pbBufPtr(&s->pb) - s->pb.buf) * 8; + if(s->pict_type==B_TYPE) s->pb_frame_bits+= 
s->frame_bits; + else s->pb_frame_bits= s->frame_bits; + s->total_bits += s->frame_bits; avctx->frame_bits = s->frame_bits; //printf("fcode: %d, type: %d, head: %d, mv: %d, misc: %d, frame: %d, itex: %d, ptex: %d\n", //s->f_code, avctx->key_frame, s->header_bits, s->mv_bits, s->misc_bits, s->frame_bits, s->i_tex_bits, s->p_tex_bits); - avctx->quality = s->qscale; if (avctx->get_psnr) { /* At this point pict->data should have the original frame */ /* an s->current_picture should have the coded/decoded frame */ get_psnr(pict->data, s->current_picture, pict->linesize, s->linesize, avctx); +// printf("%f\n", avctx->psnr_y); } return pbBufPtr(&s->pb) - s->pb.buf; } -static inline int clip(int a, int amin, int amax) -{ - if (a < amin) - return amin; - else if (a > amax) - return amax; - else - return a; -} - static inline void gmc1_motion(MpegEncContext *s, UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr, int dest_offset, @@ -626,7 +810,7 @@ static inline void gmc1_motion(MpegEncContext *s, int h) { UINT8 *ptr; - int dxy, offset, mx, my, src_x, src_y, height, linesize; + int offset, src_x, src_y, linesize; int motion_x, motion_y; if(s->real_sprite_warping_points>1) printf("more than 1 warp point isnt supported\n"); @@ -705,6 +889,8 @@ if(s->quarter_sample) pix_op[dxy](dest_y, ptr, linesize, h); pix_op[dxy](dest_y + 8, ptr + 8, linesize, h); + if(s->flags&CODEC_FLAG_GRAY) return; + if (s->out_format == FMT_H263) { dxy = 0; if ((motion_x & 3) != 0) @@ -768,6 +954,8 @@ static inline void qpel_motion(MpegEncContext *s, qpix_op[dxy](dest_y + linesize*8 , ptr + linesize*8 , linesize, linesize, motion_x&3, motion_y&3); qpix_op[dxy](dest_y + linesize*8 + 8, ptr + linesize*8 + 8, linesize, linesize, motion_x&3, motion_y&3); + if(s->flags&CODEC_FLAG_GRAY) return; + mx= (motion_x>>1) | (motion_x&1); my= (motion_y>>1) | (motion_y&1); @@ -856,6 +1044,8 @@ static inline void MPV_motion(MpegEncContext *s, dest = dest_y + ((i & 1) * 8) + (i >> 1) * 8 * s->linesize; pix_op[dxy](dest, ptr, 
s->linesize, 8); } + + if(s->flags&CODEC_FLAG_GRAY) break; /* In case of 8X8, we construct a single chroma motion vector with a special rounding */ mx = 0; @@ -922,8 +1112,7 @@ static inline void put_dct(MpegEncContext *s, { if (!s->mpeg2) s->dct_unquantize(s, block, i, s->qscale); - ff_idct (block); - put_pixels_clamped(block, dest, line_size); + ff_idct_put (dest, line_size, block); } /* add block[] to dest[] */ @@ -931,14 +1120,53 @@ static inline void add_dct(MpegEncContext *s, DCTELEM *block, int i, UINT8 *dest, int line_size) { if (s->block_last_index[i] >= 0) { - if (!s->mpeg2) - if(s->encoding || (!s->h263_msmpeg4)) - s->dct_unquantize(s, block, i, s->qscale); - ff_idct (block); - add_pixels_clamped(block, dest, line_size); + ff_idct_add (dest, line_size, block); } } +static inline void add_dequant_dct(MpegEncContext *s, + DCTELEM *block, int i, UINT8 *dest, int line_size) +{ + if (s->block_last_index[i] >= 0) { + s->dct_unquantize(s, block, i, s->qscale); + + ff_idct_add (dest, line_size, block); + } +} + +/** + * cleans dc, ac, coded_block for the current non intra MB + */ +void ff_clean_intra_table_entries(MpegEncContext *s) +{ + int wrap = s->block_wrap[0]; + int xy = s->block_index[0]; + + s->dc_val[0][xy ] = + s->dc_val[0][xy + 1 ] = + s->dc_val[0][xy + wrap] = + s->dc_val[0][xy + 1 + wrap] = 1024; + /* ac pred */ + memset(s->ac_val[0][xy ], 0, 32 * sizeof(INT16)); + memset(s->ac_val[0][xy + wrap], 0, 32 * sizeof(INT16)); + if (s->msmpeg4_version>=3) { + s->coded_block[xy ] = + s->coded_block[xy + 1 ] = + s->coded_block[xy + wrap] = + s->coded_block[xy + 1 + wrap] = 0; + } + /* chroma */ + wrap = s->block_wrap[4]; + xy = s->mb_x + 1 + (s->mb_y + 1) * wrap; + s->dc_val[1][xy] = + s->dc_val[2][xy] = 1024; + /* ac pred */ + memset(s->ac_val[1][xy], 0, 16 * sizeof(INT16)); + memset(s->ac_val[2][xy], 0, 16 * sizeof(INT16)); + + s->mbintra_table[s->mb_x + s->mb_y*s->mb_width]= 0; +} + /* generic function called after a macroblock has been parsed by the 
decoder or after it has been encoded by the encoder. @@ -952,69 +1180,39 @@ static inline void add_dct(MpegEncContext *s, void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) { int mb_x, mb_y; - int dct_linesize, dct_offset; - op_pixels_func *op_pix; - qpel_mc_func *op_qpix; + const int mb_xy = s->mb_y * s->mb_width + s->mb_x; mb_x = s->mb_x; mb_y = s->mb_y; #ifdef FF_POSTPROCESS + /* Obsolete. Exists for compatibility with mplayer only. */ quant_store[mb_y][mb_x]=s->qscale; //printf("[%02d][%02d] %d\n",mb_x,mb_y,s->qscale); +#else + if(s->avctx->quant_store) s->avctx->quant_store[mb_y*s->avctx->qstride+mb_x] = s->qscale; #endif /* update DC predictors for P macroblocks */ if (!s->mb_intra) { if (s->h263_pred || s->h263_aic) { - if(s->mbintra_table[mb_x + mb_y*s->mb_width]) - { - int wrap, xy, v; - s->mbintra_table[mb_x + mb_y*s->mb_width]=0; - wrap = 2 * s->mb_width + 2; - xy = 2 * mb_x + 1 + (2 * mb_y + 1) * wrap; - v = 1024; - - s->dc_val[0][xy] = v; - s->dc_val[0][xy + 1] = v; - s->dc_val[0][xy + wrap] = v; - s->dc_val[0][xy + 1 + wrap] = v; - /* ac pred */ - memset(s->ac_val[0][xy], 0, 16 * sizeof(INT16)); - memset(s->ac_val[0][xy + 1], 0, 16 * sizeof(INT16)); - memset(s->ac_val[0][xy + wrap], 0, 16 * sizeof(INT16)); - memset(s->ac_val[0][xy + 1 + wrap], 0, 16 * sizeof(INT16)); - if (s->h263_msmpeg4) { - s->coded_block[xy] = 0; - s->coded_block[xy + 1] = 0; - s->coded_block[xy + wrap] = 0; - s->coded_block[xy + 1 + wrap] = 0; - } - /* chroma */ - wrap = s->mb_width + 2; - xy = mb_x + 1 + (mb_y + 1) * wrap; - s->dc_val[1][xy] = v; - s->dc_val[2][xy] = v; - /* ac pred */ - memset(s->ac_val[1][xy], 0, 16 * sizeof(INT16)); - memset(s->ac_val[2][xy], 0, 16 * sizeof(INT16)); - } + if(s->mbintra_table[mb_xy]) + ff_clean_intra_table_entries(s); } else { - s->last_dc[0] = 128 << s->intra_dc_precision; - s->last_dc[1] = 128 << s->intra_dc_precision; + s->last_dc[0] = + s->last_dc[1] = s->last_dc[2] = 128 << s->intra_dc_precision; } } else if (s->h263_pred || 
s->h263_aic) - s->mbintra_table[mb_x + mb_y*s->mb_width]=1; + s->mbintra_table[mb_xy]=1; /* update motion predictor, not for B-frames as they need the motion_val from the last P/S-Frame */ - if (s->out_format == FMT_H263) { - if(s->pict_type!=B_TYPE){ - int xy, wrap, motion_x, motion_y; + if (s->out_format == FMT_H263 && s->pict_type!=B_TYPE) { //FIXME move into h263.c if possible, format specific stuff shouldnt be here + int motion_x, motion_y; - wrap = 2 * s->mb_width + 2; - xy = 2 * mb_x + 1 + (2 * mb_y + 1) * wrap; + const int wrap = s->block_wrap[0]; + const int xy = s->block_index[0]; if (s->mb_intra) { motion_x = 0; motion_y = 0; @@ -1033,20 +1231,23 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) s->motion_val[xy + 1 + wrap][0] = motion_x; s->motion_val[xy + 1 + wrap][1] = motion_y; } - } } - if (!s->intra_only) { + if (!(s->encoding && (s->intra_only || s->pict_type==B_TYPE))) { UINT8 *dest_y, *dest_cb, *dest_cr; - UINT8 *mbskip_ptr; - - /* avoid copy if macroblock skipped in last frame too */ - if (!s->encoding && s->pict_type != B_TYPE) { - mbskip_ptr = &s->mbskip_table[s->mb_y * s->mb_width + s->mb_x]; + int dct_linesize, dct_offset; + op_pixels_func *op_pix; + qpel_mc_func *op_qpix; + + /* avoid copy if macroblock skipped in last frame too + dont touch it for B-frames as they need the skip info from the next p-frame */ + if (s->pict_type != B_TYPE) { + UINT8 *mbskip_ptr = &s->mbskip_table[mb_xy]; if (s->mb_skiped) { s->mb_skiped = 0; - /* if previous was skipped too, then nothing to do ! */ - if (*mbskip_ptr != 0) + /* if previous was skipped too, then nothing to do ! 
+ skip only during decoding as we might trash the buffers during encoding a bit */ + if (*mbskip_ptr != 0 && !s->encoding) goto the_end; *mbskip_ptr = 1; /* indicate that this time we skiped it */ } else { @@ -1068,33 +1269,53 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) if (!s->mb_intra) { /* motion handling */ - if (!s->no_rounding){ - op_pix = put_pixels_tab; - op_qpix= qpel_mc_rnd_tab; - }else{ - op_pix = put_no_rnd_pixels_tab; - op_qpix= qpel_mc_no_rnd_tab; - } + /* decoding or more than one mb_type (MC was allready done otherwise) */ + if((!s->encoding) || (s->mb_type[mb_xy]&(s->mb_type[mb_xy]-1))){ + if ((!s->no_rounding) || s->pict_type==B_TYPE){ + op_pix = put_pixels_tab; + op_qpix= qpel_mc_rnd_tab; + }else{ + op_pix = put_no_rnd_pixels_tab; + op_qpix= qpel_mc_no_rnd_tab; + } - if (s->mv_dir & MV_DIR_FORWARD) { - MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix); - if (!s->no_rounding) - op_pix = avg_pixels_tab; - else - op_pix = avg_no_rnd_pixels_tab; - } - if (s->mv_dir & MV_DIR_BACKWARD) { - MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture, op_pix, op_qpix); + if (s->mv_dir & MV_DIR_FORWARD) { + MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix); + if ((!s->no_rounding) || s->pict_type==B_TYPE) + op_pix = avg_pixels_tab; + else + op_pix = avg_no_rnd_pixels_tab; + } + if (s->mv_dir & MV_DIR_BACKWARD) { + MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture, op_pix, op_qpix); + } } - /* add dct residue */ - add_dct(s, block[0], 0, dest_y, dct_linesize); - add_dct(s, block[1], 1, dest_y + 8, dct_linesize); - add_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize); - add_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize); + /* skip dequant / idct if we are really late ;) */ + if(s->hurry_up>1) goto the_end; - add_dct(s, block[4], 4, dest_cb, s->linesize >> 1); - add_dct(s, block[5], 5, dest_cr, s->linesize >> 1); + /* add dct residue */ + if(!s->mpeg2 && 
(s->encoding || (!s->h263_msmpeg4))){ + add_dequant_dct(s, block[0], 0, dest_y, dct_linesize); + add_dequant_dct(s, block[1], 1, dest_y + 8, dct_linesize); + add_dequant_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize); + add_dequant_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize); + + if(!(s->flags&CODEC_FLAG_GRAY)){ + add_dequant_dct(s, block[4], 4, dest_cb, s->linesize >> 1); + add_dequant_dct(s, block[5], 5, dest_cr, s->linesize >> 1); + } + } else { + add_dct(s, block[0], 0, dest_y, dct_linesize); + add_dct(s, block[1], 1, dest_y + 8, dct_linesize); + add_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize); + add_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize); + + if(!(s->flags&CODEC_FLAG_GRAY)){ + add_dct(s, block[4], 4, dest_cb, s->linesize >> 1); + add_dct(s, block[5], 5, dest_cr, s->linesize >> 1); + } + } } else { /* dct only in intra block */ put_dct(s, block[0], 0, dest_y, dct_linesize); @@ -1102,128 +1323,188 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) put_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize); put_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize); - put_dct(s, block[4], 4, dest_cb, s->linesize >> 1); - put_dct(s, block[5], 5, dest_cr, s->linesize >> 1); + if(!(s->flags&CODEC_FLAG_GRAY)){ + put_dct(s, block[4], 4, dest_cb, s->linesize >> 1); + put_dct(s, block[5], 5, dest_cr, s->linesize >> 1); + } } } the_end: emms_c(); //FIXME remove } -static void encode_mb(MpegEncContext *s) +static inline void dct_single_coeff_elimination(MpegEncContext *s, int n, int threshold, int skip_dc) +{ + static const char tab[64]= + {3,2,2,1,1,1,1,1, + 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0}; + int score=0; + int run=0; + int i; + DCTELEM *block= s->block[n]; + const int last_index= s->block_last_index[n]; + + if(skip_dc) skip_dc=1; + + /* are all which we could set to zero are allready zero? 
*/ + if(last_index<=skip_dc - 1) return; + + for(i=0; i<=last_index; i++){ + const int j = zigzag_direct[i]; + const int level = ABS(block[j]); + if(level==1){ + if(skip_dc && i==0) continue; + score+= tab[run]; + run=0; + }else if(level>1){ + return; + }else{ + run++; + } + } + if(score >= threshold) return; + for(i=skip_dc; i<=last_index; i++){ + const int j = zigzag_direct[i]; + block[j]=0; + } + if(block[0]) s->block_last_index[n]= 0; + else s->block_last_index[n]= -1; +} + +static inline void clip_coeffs(MpegEncContext *s, DCTELEM *block, int last_index) +{ + int i; + const int maxlevel= s->max_qcoeff; + const int minlevel= s->min_qcoeff; + + for(i=0;i<=last_index; i++){ + const int j = zigzag_direct[i]; + int level = block[j]; + + if (level>maxlevel) level=maxlevel; + else if(level<minlevel) level=minlevel; + block[j]= level; + } +} + +static void encode_mb(MpegEncContext *s, int motion_x, int motion_y) { - int wrap; const int mb_x= s->mb_x; const int mb_y= s->mb_y; - UINT8 *ptr; - const int motion_x= s->mv[0][0][0]; - const int motion_y= s->mv[0][0][1]; int i; + int skip_dct[6]; +#if 0 + if (s->interlaced_dct) { + dct_linesize = s->linesize * 2; + dct_offset = s->linesize; + } else { + dct_linesize = s->linesize; + dct_offset = s->linesize * 8; + } +#endif + for(i=0; i<6; i++) skip_dct[i]=0; - /* get the pixels */ - wrap = s->linesize; - ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16; - get_pixels(s->block[0], ptr, wrap); - get_pixels(s->block[1], ptr + 8, wrap); - get_pixels(s->block[2], ptr + 8 * wrap, wrap); - get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap); - wrap = s->linesize >> 1; - ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8; - get_pixels(s->block[4], ptr, wrap); - - wrap = s->linesize >> 1; - ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8; - get_pixels(s->block[5], ptr, wrap); - - /* subtract previous frame if non intra */ - if (!s->mb_intra) { - int dxy, offset, mx, my; + if (s->mb_intra) { + UINT8 *ptr; + int wrap; + 
+ wrap = s->linesize; + ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16; + get_pixels(s->block[0], ptr , wrap); + get_pixels(s->block[1], ptr + 8, wrap); + get_pixels(s->block[2], ptr + 8 * wrap , wrap); + get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap); + + if(s->flags&CODEC_FLAG_GRAY){ + skip_dct[4]= 1; + skip_dct[5]= 1; + }else{ + wrap >>=1; + ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8; + get_pixels(s->block[4], ptr, wrap); + + ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8; + get_pixels(s->block[5], ptr, wrap); + } + }else{ + op_pixels_func *op_pix; + qpel_mc_func *op_qpix; + UINT8 *dest_y, *dest_cb, *dest_cr; + UINT8 *ptr_y, *ptr_cb, *ptr_cr; + int wrap_y, wrap_c; + + dest_y = s->current_picture[0] + (mb_y * 16 * s->linesize ) + mb_x * 16; + dest_cb = s->current_picture[1] + (mb_y * 8 * (s->linesize >> 1)) + mb_x * 8; + dest_cr = s->current_picture[2] + (mb_y * 8 * (s->linesize >> 1)) + mb_x * 8; + wrap_y = s->linesize; + wrap_c = wrap_y>>1; + ptr_y = s->new_picture[0] + (mb_y * 16 * wrap_y) + mb_x * 16; + ptr_cb = s->new_picture[1] + (mb_y * 8 * wrap_c) + mb_x * 8; + ptr_cr = s->new_picture[2] + (mb_y * 8 * wrap_c) + mb_x * 8; + + if ((!s->no_rounding) || s->pict_type==B_TYPE){ + op_pix = put_pixels_tab; + op_qpix= qpel_mc_rnd_tab; + }else{ + op_pix = put_no_rnd_pixels_tab; + op_qpix= qpel_mc_no_rnd_tab; + } + + if (s->mv_dir & MV_DIR_FORWARD) { + MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix); + if ((!s->no_rounding) || s->pict_type==B_TYPE) + op_pix = avg_pixels_tab; + else + op_pix = avg_no_rnd_pixels_tab; + } + if (s->mv_dir & MV_DIR_BACKWARD) { + MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture, op_pix, op_qpix); + } + + diff_pixels(s->block[0], ptr_y , dest_y , wrap_y); + diff_pixels(s->block[1], ptr_y + 8, dest_y + 8, wrap_y); + diff_pixels(s->block[2], ptr_y + 8 * wrap_y , dest_y + 8 * wrap_y , wrap_y); + diff_pixels(s->block[3], ptr_y + 8 * wrap_y + 8, dest_y + 8 * wrap_y + 8, 
wrap_y); - if(s->mv_type==MV_TYPE_16X16){ - dxy = ((motion_y & 1) << 1) | (motion_x & 1); - ptr = s->last_picture[0] + - ((mb_y * 16 + (motion_y >> 1)) * s->linesize) + - (mb_x * 16 + (motion_x >> 1)); - - sub_pixels_2(s->block[0], ptr, s->linesize, dxy); - sub_pixels_2(s->block[1], ptr + 8, s->linesize, dxy); - sub_pixels_2(s->block[2], ptr + s->linesize * 8, s->linesize, dxy); - sub_pixels_2(s->block[3], ptr + 8 + s->linesize * 8, s->linesize ,dxy); - - if (s->out_format == FMT_H263) { - /* special rounding for h263 */ - dxy = 0; - if ((motion_x & 3) != 0) - dxy |= 1; - if ((motion_y & 3) != 0) - dxy |= 2; - mx = motion_x >> 2; - my = motion_y >> 2; - } else { - mx = motion_x / 2; - my = motion_y / 2; - dxy = ((my & 1) << 1) | (mx & 1); - mx >>= 1; - my >>= 1; - } - offset = ((mb_y * 8 + my) * (s->linesize >> 1)) + (mb_x * 8 + mx); - ptr = s->last_picture[1] + offset; - sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy); - ptr = s->last_picture[2] + offset; - sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy); + if(s->flags&CODEC_FLAG_GRAY){ + skip_dct[4]= 1; + skip_dct[5]= 1; }else{ - int src_x, src_y; - - for(i=0;i<4;i++) { - int motion_x = s->mv[0][i][0]; - int motion_y = s->mv[0][i][1]; - - dxy = ((motion_y & 1) << 1) | (motion_x & 1); - src_x = mb_x * 16 + (motion_x >> 1) + (i & 1) * 8; - src_y = mb_y * 16 + (motion_y >> 1) + (i >>1) * 8; - - ptr = s->last_picture[0] + (src_y * s->linesize) + (src_x); - sub_pixels_2(s->block[i], ptr, s->linesize, dxy); - } - /* In case of 8X8, we construct a single chroma motion vector - with a special rounding */ - mx = 0; - my = 0; - for(i=0;i<4;i++) { - mx += s->mv[0][i][0]; - my += s->mv[0][i][1]; - } - if (mx >= 0) - mx = (h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1)); - else { - mx = -mx; - mx = -(h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1)); - } - if (my >= 0) - my = (h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1)); - else { - my = -my; - my = -(h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1)); - 
} - dxy = ((my & 1) << 1) | (mx & 1); - mx >>= 1; - my >>= 1; - - src_x = mb_x * 8 + mx; - src_y = mb_y * 8 + my; - src_x = clip(src_x, -8, s->width/2); - if (src_x == s->width/2) - dxy &= ~1; - src_y = clip(src_y, -8, s->height/2); - if (src_y == s->height/2) - dxy &= ~2; - - offset = (src_y * (s->linesize >> 1)) + src_x; - ptr = s->last_picture[1] + offset; - sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy); - ptr = s->last_picture[2] + offset; - sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy); + diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c); + diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c); } + + /* pre quantization */ + if(s->mc_mb_var[s->mb_width*mb_y+ mb_x]<2*s->qscale*s->qscale){ + if(pix_abs8x8(ptr_y , dest_y , wrap_y) < 20*s->qscale) skip_dct[0]= 1; + if(pix_abs8x8(ptr_y + 8, dest_y + 8, wrap_y) < 20*s->qscale) skip_dct[1]= 1; + if(pix_abs8x8(ptr_y + 8*wrap_y , dest_y + 8*wrap_y , wrap_y) < 20*s->qscale) skip_dct[2]= 1; + if(pix_abs8x8(ptr_y + 8*wrap_y + 8, dest_y + 8*wrap_y + 8, wrap_y) < 20*s->qscale) skip_dct[3]= 1; + if(pix_abs8x8(ptr_cb , dest_cb , wrap_y) < 20*s->qscale) skip_dct[4]= 1; + if(pix_abs8x8(ptr_cr , dest_cr , wrap_y) < 20*s->qscale) skip_dct[5]= 1; +#if 0 +{ + static int stat[7]; + int num=0; + for(i=0; i<6; i++) + if(skip_dct[i]) num++; + stat[num]++; + + if(s->mb_x==0 && s->mb_y==0){ + for(i=0; i<7; i++){ + printf("%6d %1d\n", stat[i], i); + } + } +} +#endif + } + } #if 0 @@ -1240,17 +1521,47 @@ static void encode_mb(MpegEncContext *s) } #endif /* DCT & quantize */ - if (s->h263_msmpeg4) { - msmpeg4_dc_scale(s); - } else if (s->h263_pred) { + if (s->h263_pred && !(s->msmpeg4_version==1 || s->msmpeg4_version==2)) { h263_dc_scale(s); + } else if (s->h263_aic) { + s->y_dc_scale = 2*s->qscale; + s->c_dc_scale = 2*s->qscale; } else { /* default quantization values */ s->y_dc_scale = 8; s->c_dc_scale = 8; } - for(i=0;i<6;i++) { - s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale); + 
if(s->out_format==FMT_MJPEG){ + for(i=0;i<6;i++) { + int overflow; + s->block_last_index[i] = dct_quantize(s, s->block[i], i, 8, &overflow); + if (overflow) clip_coeffs(s, s->block[i], s->block_last_index[i]); + } + }else{ + for(i=0;i<6;i++) { + if(!skip_dct[i]){ + int overflow; + s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale, &overflow); + // FIXME we could decide to change to quantizer instead of clipping + // JS: I don't think that would be a good idea it could lower quality instead + // of improve it. Just INTRADC clipping deserves changes in quantizer + if (overflow) clip_coeffs(s, s->block[i], s->block_last_index[i]); + }else + s->block_last_index[i]= -1; + } + if(s->luma_elim_threshold && !s->mb_intra) + for(i=0; i<4; i++) + dct_single_coeff_elimination(s, i, s->luma_elim_threshold, 0); + if(s->chroma_elim_threshold && !s->mb_intra) + for(i=4; i<6; i++) + dct_single_coeff_elimination(s, i, s->chroma_elim_threshold, 1); + } + + if((s->flags&CODEC_FLAG_GRAY) && s->mb_intra){ + s->block_last_index[4]= + s->block_last_index[5]= 0; + s->block[4][0]= + s->block[5][0]= 128; } /* huffman encode */ @@ -1272,14 +1583,113 @@ static void encode_mb(MpegEncContext *s) } } -static void copy_bits(PutBitContext *pb, UINT8 *src, int length) +void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length) { +#if 1 + int bytes= length>>4; + int bits= length&15; + int i; + + if(length==0) return; + + for(i=0; i<bytes; i++) put_bits(pb, 16, be2me_16(((uint16_t*)src)[i])); + put_bits(pb, bits, be2me_16(((uint16_t*)src)[i])>>(16-bits)); +#else int bytes= length>>3; int bits= length&7; int i; for(i=0; i<bytes; i++) put_bits(pb, 8, src[i]); put_bits(pb, bits, src[i]>>(8-bits)); +#endif +} + +static inline void copy_context_before_encode(MpegEncContext *d, MpegEncContext *s, int type){ + int i; + + memcpy(d->last_mv, s->last_mv, 2*2*2*sizeof(int)); //FIXME is memcpy faster then a loop? 
+ + /* mpeg1 */ + d->mb_incr= s->mb_incr; + for(i=0; i<3; i++) + d->last_dc[i]= s->last_dc[i]; + + /* statistics */ + d->mv_bits= s->mv_bits; + d->i_tex_bits= s->i_tex_bits; + d->p_tex_bits= s->p_tex_bits; + d->i_count= s->i_count; + d->p_count= s->p_count; + d->skip_count= s->skip_count; + d->misc_bits= s->misc_bits; + d->last_bits= 0; + + d->mb_skiped= s->mb_skiped; +} + +static inline void copy_context_after_encode(MpegEncContext *d, MpegEncContext *s, int type){ + int i; + + memcpy(d->mv, s->mv, 2*4*2*sizeof(int)); + memcpy(d->last_mv, s->last_mv, 2*2*2*sizeof(int)); //FIXME is memcpy faster then a loop? + + /* mpeg1 */ + d->mb_incr= s->mb_incr; + for(i=0; i<3; i++) + d->last_dc[i]= s->last_dc[i]; + + /* statistics */ + d->mv_bits= s->mv_bits; + d->i_tex_bits= s->i_tex_bits; + d->p_tex_bits= s->p_tex_bits; + d->i_count= s->i_count; + d->p_count= s->p_count; + d->skip_count= s->skip_count; + d->misc_bits= s->misc_bits; + + d->mb_intra= s->mb_intra; + d->mb_skiped= s->mb_skiped; + d->mv_type= s->mv_type; + d->mv_dir= s->mv_dir; + d->pb= s->pb; + if(s->data_partitioning){ + d->pb2= s->pb2; + d->tex_pb= s->tex_pb; + } + d->block= s->block; + for(i=0; i<6; i++) + d->block_last_index[i]= s->block_last_index[i]; +} + +static inline void encode_mb_hq(MpegEncContext *s, MpegEncContext *backup, MpegEncContext *best, int type, + PutBitContext pb[2], PutBitContext pb2[2], PutBitContext tex_pb[2], + int *dmin, int *next_block, int motion_x, int motion_y) +{ + int bits_count; + + copy_context_before_encode(s, backup, type); + + s->block= s->blocks[*next_block]; + s->pb= pb[*next_block]; + if(s->data_partitioning){ + s->pb2 = pb2 [*next_block]; + s->tex_pb= tex_pb[*next_block]; + } + + encode_mb(s, motion_x, motion_y); + + bits_count= get_bit_count(&s->pb); + if(s->data_partitioning){ + bits_count+= get_bit_count(&s->pb2); + bits_count+= get_bit_count(&s->tex_pb); + } + + if(bits_count<*dmin){ + *dmin= bits_count; + *next_block^=1; + + copy_context_after_encode(best, s, 
type); + } } static void encode_picture(MpegEncContext *s, int picture_number) @@ -1287,8 +1697,17 @@ static void encode_picture(MpegEncContext *s, int picture_number) int mb_x, mb_y, last_gob, pdif = 0; int i; int bits; - MpegEncContext best_s; - UINT8 bit_buf[4][3000]; //FIXME check that this is ALLWAYS large enogh for a MB + MpegEncContext best_s, backup_s; + UINT8 bit_buf[2][3000]; + UINT8 bit_buf2[2][3000]; + UINT8 bit_buf_tex[2][3000]; + PutBitContext pb[2], pb2[2], tex_pb[2]; + + for(i=0; i<2; i++){ + init_put_bits(&pb [i], bit_buf [i], 3000, NULL, NULL); + init_put_bits(&pb2 [i], bit_buf2 [i], 3000, NULL, NULL); + init_put_bits(&tex_pb[i], bit_buf_tex[i], 3000, NULL, NULL); + } s->picture_number = picture_number; @@ -1299,12 +1718,16 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->block_wrap[4]= s->block_wrap[5]= s->mb_width + 2; - s->last_mc_mb_var = s->mc_mb_var; /* Reset the average MB variance */ - s->avg_mb_var = 0; - s->mc_mb_var = 0; + s->mb_var_sum = 0; + s->mc_mb_var_sum = 0; + + /* we need to initialize some time vars before we can encode b-frames */ + if (s->h263_pred && !s->h263_msmpeg4) + ff_set_mpeg4_time(s, s->picture_number); + /* Estimate motion for every MB */ - if(s->pict_type == P_TYPE){ + if(s->pict_type != I_TYPE){ for(mb_y=0; mb_y < s->mb_height; mb_y++) { s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1; s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1); @@ -1319,127 +1742,61 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->block_index[3]+=2; /* compute motion vector & mb_type and store in context */ - estimate_motion(s, mb_x, mb_y); + if(s->pict_type==B_TYPE) + ff_estimate_b_frame_motion(s, mb_x, mb_y); + else + ff_estimate_p_frame_motion(s, mb_x, mb_y); // s->mb_type[mb_y*s->mb_width + mb_x]=MB_TYPE_INTER; } } emms_c(); - }else{ + }else /* if(s->pict_type == I_TYPE) */{ /* I-Frame */ //FIXME do we need to zero them? 
memset(s->motion_val[0], 0, sizeof(INT16)*(s->mb_width*2 + 2)*(s->mb_height*2 + 2)*2); - memset(s->mv_table[0] , 0, sizeof(INT16)*s->mb_width*s->mb_height); - memset(s->mv_table[1] , 0, sizeof(INT16)*s->mb_width*s->mb_height); + memset(s->p_mv_table , 0, sizeof(INT16)*(s->mb_width+2)*(s->mb_height+2)*2); memset(s->mb_type , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height); } - if(s->avg_mb_var < s->mc_mb_var && s->pict_type != B_TYPE && (!s->force_type)){ //FIXME subtract MV bits + if(s->mb_var_sum < s->mc_mb_var_sum && s->pict_type == P_TYPE){ //FIXME subtract MV bits s->pict_type= I_TYPE; - s->picture_in_gop_number=0; memset(s->mb_type , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height); + if(s->max_b_frames==0){ + s->input_pict_type= I_TYPE; + s->input_picture_in_gop_number=0; + } //printf("Scene change detected, encoding as I Frame\n"); } - - /* find best f_code for ME which do unlimited searches */ - if(s->pict_type==P_TYPE && s->full_search>3){ - int mv_num[8]; - int i; - int loose=0; - UINT8 * fcode_tab= s->fcode_tab; - - for(i=0; i<8; i++) mv_num[i]=0; - - for(i=0; i<s->mb_num; i++){ - if(s->mb_type[i] & MB_TYPE_INTER){ - mv_num[ fcode_tab[s->mv_table[0][i] + MAX_MV] ]++; - mv_num[ fcode_tab[s->mv_table[1][i] + MAX_MV] ]++; -//printf("%d %d %d\n", s->mv_table[0][i], fcode_tab[s->mv_table[0][i] + MAX_MV], i); - } -//else printf("I"); - } - - for(i=MAX_FCODE; i>1; i--){ - loose+= mv_num[i]; - if(loose > 10) break; //FIXME this is pretty ineffective - } - s->f_code= i; -/* for(i=0; i<=MAX_FCODE; i++){ - printf("%d ", mv_num[i]); - } - printf("\n");*/ - }else{ - s->f_code= 1; + + if(s->pict_type==P_TYPE || s->pict_type==S_TYPE) + s->f_code= ff_get_best_fcode(s, s->p_mv_table, MB_TYPE_INTER); + ff_fix_long_p_mvs(s); + if(s->pict_type==B_TYPE){ + s->f_code= ff_get_best_fcode(s, s->b_forw_mv_table, MB_TYPE_FORWARD); + s->b_code= ff_get_best_fcode(s, s->b_back_mv_table, MB_TYPE_BACKWARD); + + ff_fix_long_b_mvs(s, s->b_forw_mv_table, s->f_code, 
MB_TYPE_FORWARD); + ff_fix_long_b_mvs(s, s->b_back_mv_table, s->b_code, MB_TYPE_BACKWARD); + ff_fix_long_b_mvs(s, s->b_bidir_forw_mv_table, s->f_code, MB_TYPE_BIDIR); + ff_fix_long_b_mvs(s, s->b_bidir_back_mv_table, s->b_code, MB_TYPE_BIDIR); } - + //printf("f_code %d ///\n", s->f_code); - /* convert MBs with too long MVs to I-Blocks */ - if(s->pict_type==P_TYPE){ - int i, x, y; - const int f_code= s->f_code; - UINT8 * fcode_tab= s->fcode_tab; -//FIXME try to clip instead of intra izing ;) - /* clip / convert to intra 16x16 type MVs */ - for(i=0; i<s->mb_num; i++){ - if(s->mb_type[i]&MB_TYPE_INTER){ - if( fcode_tab[s->mv_table[0][i] + MAX_MV] > f_code - || fcode_tab[s->mv_table[0][i] + MAX_MV] == 0 - || fcode_tab[s->mv_table[1][i] + MAX_MV] > f_code - || fcode_tab[s->mv_table[1][i] + MAX_MV] == 0 ){ - s->mb_type[i] &= ~MB_TYPE_INTER; - s->mb_type[i] |= MB_TYPE_INTRA; - s->mv_table[0][i] = 0; - s->mv_table[1][i] = 0; - } - } - } - - if(s->flags&CODEC_FLAG_4MV){ - int wrap= 2+ s->mb_width*2; - - /* clip / convert to intra 8x8 type MVs */ - for(y=0; y<s->mb_height; y++){ - int xy= (y*2 + 1)*wrap + 1; - i= y*s->mb_width; - - for(x=0; x<s->mb_width; x++){ - if(s->mb_type[i]&MB_TYPE_INTER4V){ - int block; - for(block=0; block<4; block++){ - int off= (block& 1) + (block>>1)*wrap; - int mx= s->motion_val[ xy + off ][0]; - int my= s->motion_val[ xy + off ][1]; - - if( fcode_tab[mx + MAX_MV] > f_code - || fcode_tab[mx + MAX_MV] == 0 - || fcode_tab[my + MAX_MV] > f_code - || fcode_tab[my + MAX_MV] == 0 ){ - s->mb_type[i] &= ~MB_TYPE_INTER4V; - s->mb_type[i] |= MB_TYPE_INTRA; - } - } - xy+=2; - i++; - } - } - } - } - } // printf("%d %d\n", s->avg_mb_var, s->mc_mb_var); - if (!s->fixed_qscale) - s->qscale = rate_estimate_qscale(s); + if(s->flags&CODEC_FLAG_PASS2) + s->qscale = ff_rate_estimate_qscale_pass2(s); + else if (!s->fixed_qscale) + s->qscale = ff_rate_estimate_qscale(s); - /* precompute matrix */ if (s->out_format == FMT_MJPEG) { /* for mjpeg, we do include qscale in 
the matrix */ s->intra_matrix[0] = default_intra_matrix[0]; for(i=1;i<64;i++) - s->intra_matrix[i] = (default_intra_matrix[i] * s->qscale) >> 3; - convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->intra_matrix, 8); - } else { - convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->intra_matrix, s->qscale); - convert_matrix(s->q_non_intra_matrix, s->q_non_intra_matrix16, s->non_intra_matrix, s->qscale); + s->intra_matrix[i] = CLAMP_TO_8BIT((default_intra_matrix[i] * s->qscale) >> 3); + convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, + s->q_intra_matrix16_bias, s->intra_matrix, s->intra_quant_bias); } s->last_bits= get_bit_count(&s->pb); @@ -1489,21 +1846,31 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->gob_index = 2; else s->gob_index = 4; + }else if(s->codec_id==CODEC_ID_MPEG4){ + s->gob_index = 1; } - - s->avg_mb_var = s->avg_mb_var / s->mb_num; - + + if(s->codec_id==CODEC_ID_MPEG4 && s->data_partitioning && s->pict_type!=B_TYPE) + ff_mpeg4_init_partitions(s); + + s->resync_mb_x=0; + s->resync_mb_y=0; for(mb_y=0; mb_y < s->mb_height; mb_y++) { - /* Put GOB header based on RTP MTU */ + /* Put GOB header based on RTP MTU for formats which support it per line (H263*)*/ /* TODO: Put all this stuff in a separate generic function */ if (s->rtp_mode) { if (!mb_y) { s->ptr_lastgob = s->pb.buf; s->ptr_last_mb_line = s->pb.buf; } else if (s->out_format == FMT_H263 && !s->h263_pred && !s->h263_msmpeg4 && !(mb_y % s->gob_index)) { + // MN: we could move the space check from h263 -> here, as its not h263 specific last_gob = h263_encode_gob_header(s, mb_y); if (last_gob) { - s->first_gob_line = 1; + s->first_slice_line = 1; + }else{ + /*MN: we reset it here instead at the end of each line cuz mpeg4 can have + slice lines starting & ending in the middle*/ + s->first_slice_line = 0; } } } @@ -1516,10 +1883,9 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height 
+ 2) + s->block_wrap[0]*(s->mb_height*2 + 2); for(mb_x=0; mb_x < s->mb_width; mb_x++) { const int mb_type= s->mb_type[mb_y * s->mb_width + mb_x]; - PutBitContext pb; - int d; + const int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1; +// int d; int dmin=10000000; - int best=0; s->mb_x = mb_x; s->mb_y = mb_y; @@ -1529,114 +1895,216 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->block_index[3]+=2; s->block_index[4]++; s->block_index[5]++; + + /* write gob / video packet header for formats which support it at any MB (MPEG4) */ + if(s->rtp_mode && s->mb_y>0 && s->codec_id==CODEC_ID_MPEG4){ + int pdif= pbBufPtr(&s->pb) - s->ptr_lastgob; + + //the *2 is there so we stay below the requested size + if(pdif + s->mb_line_avgsize/s->mb_width >= s->rtp_payload_size){ + if(s->codec_id==CODEC_ID_MPEG4){ + if(s->data_partitioning && s->pict_type!=B_TYPE){ + ff_mpeg4_merge_partitions(s); + ff_mpeg4_init_partitions(s); + } + ff_mpeg4_encode_video_packet_header(s); + + if(s->flags&CODEC_FLAG_PASS1){ + int bits= get_bit_count(&s->pb); + s->misc_bits+= bits - s->last_bits; + s->last_bits= bits; + } + ff_mpeg4_clean_buffers(s); + } + s->ptr_lastgob = pbBufPtr(&s->pb); + s->first_slice_line=1; + s->resync_mb_x=mb_x; + s->resync_mb_y=mb_y; + } + + if( (s->resync_mb_x == s->mb_x) + && s->resync_mb_y+1 == s->mb_y){ + s->first_slice_line=0; + } + } - s->mv_dir = MV_DIR_FORWARD; if(mb_type & (mb_type-1)){ // more than 1 MB type possible - pb= s->pb; + int next_block=0; + int pb_bits_count, pb2_bits_count, tex_pb_bits_count; + + copy_context_before_encode(&backup_s, s, -1); + backup_s.pb= s->pb; + best_s.data_partitioning= s->data_partitioning; + if(s->data_partitioning){ + backup_s.pb2= s->pb2; + backup_s.tex_pb= s->tex_pb; + } + if(mb_type&MB_TYPE_INTER){ + s->mv_dir = MV_DIR_FORWARD; s->mv_type = MV_TYPE_16X16; s->mb_intra= 0; - s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x]; - s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x]; - 
init_put_bits(&s->pb, bit_buf[1], 3000, NULL, NULL); - s->block= s->inter_block; - - encode_mb(s); - d= get_bit_count(&s->pb); - if(d<dmin){ - flush_put_bits(&s->pb); - dmin=d; - best_s.mv[0][0][0]= s->mv[0][0][0]; - best_s.mv[0][0][1]= s->mv[0][0][1]; - best_s.mb_intra= 0; - best_s.mv_type = MV_TYPE_16X16; - best_s.pb=s->pb; - best_s.block= s->block; - best=1; - for(i=0; i<6; i++) - best_s.block_last_index[i]= s->block_last_index[i]; - } + s->mv[0][0][0] = s->p_mv_table[xy][0]; + s->mv[0][0][1] = s->p_mv_table[xy][1]; + encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_INTER, pb, pb2, tex_pb, + &dmin, &next_block, s->mv[0][0][0], s->mv[0][0][1]); } - if(mb_type&MB_TYPE_INTER4V){ + if(mb_type&MB_TYPE_INTER4V){ + s->mv_dir = MV_DIR_FORWARD; s->mv_type = MV_TYPE_8X8; s->mb_intra= 0; for(i=0; i<4; i++){ s->mv[0][i][0] = s->motion_val[s->block_index[i]][0]; s->mv[0][i][1] = s->motion_val[s->block_index[i]][1]; } - init_put_bits(&s->pb, bit_buf[2], 3000, NULL, NULL); - s->block= s->inter4v_block; - - encode_mb(s); - d= get_bit_count(&s->pb); - if(d<dmin){ - flush_put_bits(&s->pb); - dmin=d; - for(i=0; i<4; i++){ - best_s.mv[0][i][0] = s->mv[0][i][0]; - best_s.mv[0][i][1] = s->mv[0][i][1]; - } - best_s.mb_intra= 0; - best_s.mv_type = MV_TYPE_8X8; - best_s.pb=s->pb; - best_s.block= s->block; - best=2; - for(i=0; i<6; i++) - best_s.block_last_index[i]= s->block_last_index[i]; - } + encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_INTER4V, pb, pb2, tex_pb, + &dmin, &next_block, 0, 0); + } + if(mb_type&MB_TYPE_FORWARD){ + s->mv_dir = MV_DIR_FORWARD; + s->mv_type = MV_TYPE_16X16; + s->mb_intra= 0; + s->mv[0][0][0] = s->b_forw_mv_table[xy][0]; + s->mv[0][0][1] = s->b_forw_mv_table[xy][1]; + encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_FORWARD, pb, pb2, tex_pb, + &dmin, &next_block, s->mv[0][0][0], s->mv[0][0][1]); + } + if(mb_type&MB_TYPE_BACKWARD){ + s->mv_dir = MV_DIR_BACKWARD; + s->mv_type = MV_TYPE_16X16; + s->mb_intra= 0; + s->mv[1][0][0] = s->b_back_mv_table[xy][0]; + s->mv[1][0][1] 
= s->b_back_mv_table[xy][1]; + encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_BACKWARD, pb, pb2, tex_pb, + &dmin, &next_block, s->mv[1][0][0], s->mv[1][0][1]); + } + if(mb_type&MB_TYPE_BIDIR){ + s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD; + s->mv_type = MV_TYPE_16X16; + s->mb_intra= 0; + s->mv[0][0][0] = s->b_bidir_forw_mv_table[xy][0]; + s->mv[0][0][1] = s->b_bidir_forw_mv_table[xy][1]; + s->mv[1][0][0] = s->b_bidir_back_mv_table[xy][0]; + s->mv[1][0][1] = s->b_bidir_back_mv_table[xy][1]; + encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_BIDIR, pb, pb2, tex_pb, + &dmin, &next_block, 0, 0); + } + if(mb_type&MB_TYPE_DIRECT){ + s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT; + s->mv_type = MV_TYPE_16X16; //FIXME + s->mb_intra= 0; + s->mv[0][0][0] = s->b_direct_forw_mv_table[xy][0]; + s->mv[0][0][1] = s->b_direct_forw_mv_table[xy][1]; + s->mv[1][0][0] = s->b_direct_back_mv_table[xy][0]; + s->mv[1][0][1] = s->b_direct_back_mv_table[xy][1]; + encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_DIRECT, pb, pb2, tex_pb, + &dmin, &next_block, s->b_direct_mv_table[xy][0], s->b_direct_mv_table[xy][1]); } if(mb_type&MB_TYPE_INTRA){ + s->mv_dir = MV_DIR_FORWARD; s->mv_type = MV_TYPE_16X16; s->mb_intra= 1; s->mv[0][0][0] = 0; s->mv[0][0][1] = 0; - init_put_bits(&s->pb, bit_buf[0], 3000, NULL, NULL); - s->block= s->intra_block; - - encode_mb(s); - d= get_bit_count(&s->pb); - if(d<dmin){ - flush_put_bits(&s->pb); - dmin=d; - best_s.mv[0][0][0]= 0; - best_s.mv[0][0][1]= 0; - best_s.mb_intra= 1; - best_s.mv_type = MV_TYPE_16X16; - best_s.pb=s->pb; - best_s.block= s->block; - for(i=0; i<6; i++) - best_s.block_last_index[i]= s->block_last_index[i]; - best=0; - } - /* force cleaning of ac/dc if needed ... */ - s->mbintra_table[mb_x + mb_y*s->mb_width]=1; + encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_INTRA, pb, pb2, tex_pb, + &dmin, &next_block, 0, 0); + /* force cleaning of ac/dc pred stuff if needed ... 
*/ + if(s->h263_pred || s->h263_aic) + s->mbintra_table[mb_x + mb_y*s->mb_width]=1; } - for(i=0; i<4; i++){ - s->mv[0][i][0] = best_s.mv[0][i][0]; - s->mv[0][i][1] = best_s.mv[0][i][1]; + copy_context_after_encode(s, &best_s, -1); + + pb_bits_count= get_bit_count(&s->pb); + flush_put_bits(&s->pb); + ff_copy_bits(&backup_s.pb, bit_buf[next_block^1], pb_bits_count); + s->pb= backup_s.pb; + + if(s->data_partitioning){ + pb2_bits_count= get_bit_count(&s->pb2); + flush_put_bits(&s->pb2); + ff_copy_bits(&backup_s.pb2, bit_buf2[next_block^1], pb2_bits_count); + s->pb2= backup_s.pb2; + + tex_pb_bits_count= get_bit_count(&s->tex_pb); + flush_put_bits(&s->tex_pb); + ff_copy_bits(&backup_s.tex_pb, bit_buf_tex[next_block^1], tex_pb_bits_count); + s->tex_pb= backup_s.tex_pb; } - s->mb_intra= best_s.mb_intra; - s->mv_type= best_s.mv_type; - for(i=0; i<6; i++) - s->block_last_index[i]= best_s.block_last_index[i]; - copy_bits(&pb, bit_buf[best], dmin); - s->block= best_s.block; - s->pb= pb; + s->last_bits= get_bit_count(&s->pb); } else { + int motion_x, motion_y; + s->mv_type=MV_TYPE_16X16; // only one MB-Type possible - if(mb_type&MB_TYPE_INTRA){ + switch(mb_type){ + case MB_TYPE_INTRA: + s->mv_dir = MV_DIR_FORWARD; s->mb_intra= 1; - s->mv[0][0][0] = 0; - s->mv[0][0][1] = 0; - }else{ + motion_x= s->mv[0][0][0] = 0; + motion_y= s->mv[0][0][1] = 0; + break; + case MB_TYPE_INTER: + s->mv_dir = MV_DIR_FORWARD; + s->mb_intra= 0; + motion_x= s->mv[0][0][0] = s->p_mv_table[xy][0]; + motion_y= s->mv[0][0][1] = s->p_mv_table[xy][1]; + break; + case MB_TYPE_INTER4V: + s->mv_dir = MV_DIR_FORWARD; + s->mv_type = MV_TYPE_8X8; s->mb_intra= 0; - s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x]; - s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x]; + for(i=0; i<4; i++){ + s->mv[0][i][0] = s->motion_val[s->block_index[i]][0]; + s->mv[0][i][1] = s->motion_val[s->block_index[i]][1]; + } + motion_x= motion_y= 0; + break; + case MB_TYPE_DIRECT: + s->mv_dir = MV_DIR_FORWARD | 
MV_DIR_BACKWARD | MV_DIRECT; + s->mb_intra= 0; + motion_x=s->b_direct_mv_table[xy][0]; + motion_y=s->b_direct_mv_table[xy][1]; + s->mv[0][0][0] = s->b_direct_forw_mv_table[xy][0]; + s->mv[0][0][1] = s->b_direct_forw_mv_table[xy][1]; + s->mv[1][0][0] = s->b_direct_back_mv_table[xy][0]; + s->mv[1][0][1] = s->b_direct_back_mv_table[xy][1]; + break; + case MB_TYPE_BIDIR: + s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD; + s->mb_intra= 0; + motion_x=0; + motion_y=0; + s->mv[0][0][0] = s->b_bidir_forw_mv_table[xy][0]; + s->mv[0][0][1] = s->b_bidir_forw_mv_table[xy][1]; + s->mv[1][0][0] = s->b_bidir_back_mv_table[xy][0]; + s->mv[1][0][1] = s->b_bidir_back_mv_table[xy][1]; + break; + case MB_TYPE_BACKWARD: + s->mv_dir = MV_DIR_BACKWARD; + s->mb_intra= 0; + motion_x= s->mv[1][0][0] = s->b_back_mv_table[xy][0]; + motion_y= s->mv[1][0][1] = s->b_back_mv_table[xy][1]; + break; + case MB_TYPE_FORWARD: + s->mv_dir = MV_DIR_FORWARD; + s->mb_intra= 0; + motion_x= s->mv[0][0][0] = s->b_forw_mv_table[xy][0]; + motion_y= s->mv[0][0][1] = s->b_forw_mv_table[xy][1]; +// printf(" %d %d ", motion_x, motion_y); + break; + default: + motion_x=motion_y=0; //gcc warning fix + printf("illegal MB type\n"); } - encode_mb(s); + encode_mb(s, motion_x, motion_y); + } + /* clean the MV table in IPS frames for direct mode in B frames */ + if(s->mb_intra /* && I,P,S_TYPE */){ + s->p_mv_table[xy][0]=0; + s->p_mv_table[xy][1]=0; } MPV_decode_mb(s, s->block); +//printf("MB %d %d bits\n", s->mb_x+s->mb_y*s->mb_width, get_bit_count(&s->pb)); } @@ -1650,14 +2118,20 @@ static void encode_picture(MpegEncContext *s, int picture_number) } //fprintf(stderr, "\nMB line: %d\tSize: %u\tAvg. 
Size: %u", s->mb_y, // (s->pb.buf_ptr - s->ptr_last_mb_line), s->mb_line_avgsize); - s->first_gob_line = 0; + if(s->codec_id!=CODEC_ID_MPEG4) s->first_slice_line = 0; //FIXME clean } } emms_c(); - if (s->h263_msmpeg4 && s->pict_type == I_TYPE) + if(s->codec_id==CODEC_ID_MPEG4 && s->data_partitioning && s->pict_type!=B_TYPE) + ff_mpeg4_merge_partitions(s); + + if (s->msmpeg4_version && s->msmpeg4_version<4 && s->pict_type == I_TYPE) msmpeg4_encode_ext_header(s); + if(s->codec_id==CODEC_ID_MPEG4) + ff_mpeg4_stuffing(&s->pb); + //if (s->gob_number) // fprintf(stderr,"\nNumber of GOB: %d", s->gob_number); @@ -1675,30 +2149,14 @@ static void encode_picture(MpegEncContext *s, int picture_number) static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, - int qscale) + int qscale, int *overflow) { int i, j, level, last_non_zero, q; const int *qmat; - int minLevel, maxLevel; - - if(s->avctx!=NULL && s->avctx->codec->id==CODEC_ID_MPEG4){ - /* mpeg4 */ - minLevel= -2048; - maxLevel= 2047; - }else if(s->out_format==FMT_MPEG1){ - /* mpeg1 */ - minLevel= -255; - maxLevel= 255; - }else if(s->out_format==FMT_MJPEG){ - /* (m)jpeg */ - minLevel= -1023; - maxLevel= 1023; - }else{ - /* h263 / msmpeg4 */ - minLevel= -128; - maxLevel= 127; - } - + int bias; + int max=0; + unsigned int threshold1, threshold2; + av_fdct (block); /* we need this permutation so that we correct the IDCT @@ -1706,81 +2164,54 @@ static int dct_quantize_c(MpegEncContext *s, block_permute(block); if (s->mb_intra) { - if (n < 4) - q = s->y_dc_scale; - else - q = s->c_dc_scale; - q = q << 3; - + if (!s->h263_aic) { + if (n < 4) + q = s->y_dc_scale; + else + q = s->c_dc_scale; + q = q << 3; + } else + /* For AIC we skip quant/dequant of INTRADC */ + q = 1 << 3; + /* note: block[0] is assumed to be positive */ block[0] = (block[0] + (q >> 1)) / q; i = 1; last_non_zero = 0; - if (s->out_format == FMT_H263) { - qmat = s->q_non_intra_matrix; - } else { - qmat = s->q_intra_matrix; - } + qmat = 
s->q_intra_matrix[qscale]; + bias= s->intra_quant_bias<<(QMAT_SHIFT - 3 - QUANT_BIAS_SHIFT); } else { i = 0; last_non_zero = -1; - qmat = s->q_non_intra_matrix; + qmat = s->q_inter_matrix[qscale]; + bias= s->inter_quant_bias<<(QMAT_SHIFT - 3 - QUANT_BIAS_SHIFT); } + threshold1= (1<<(QMAT_SHIFT - 3)) - bias - 1; + threshold2= threshold1<<1; for(;i<64;i++) { j = zigzag_direct[i]; level = block[j]; level = level * qmat[j]; -#ifdef PARANOID - { - static int count = 0; - int level1, level2, qmat1; - double val; - if (qmat == s->q_non_intra_matrix) { - qmat1 = default_non_intra_matrix[j] * s->qscale; - } else { - qmat1 = default_intra_matrix[j] * s->qscale; - } - if (av_fdct != jpeg_fdct_ifast) - val = ((double)block[j] * 8.0) / (double)qmat1; - else - val = ((double)block[j] * 8.0 * 2048.0) / - ((double)qmat1 * aanscales[j]); - level1 = (int)val; - level2 = level / (1 << (QMAT_SHIFT - 3)); - if (level1 != level2) { - fprintf(stderr, "%d: quant error qlevel=%d wanted=%d level=%d qmat1=%d qmat=%d wantedf=%0.6f\n", - count, level2, level1, block[j], qmat1, qmat[j], - val); - count++; - } - } -#endif - /* XXX: slight error for the low range. Test should be equivalent to - (level <= -(1 << (QMAT_SHIFT - 3)) || level >= (1 << - (QMAT_SHIFT - 3))) - */ - if (((level << (31 - (QMAT_SHIFT - 3))) >> (31 - (QMAT_SHIFT - 3))) != - level) { - level = level / (1 << (QMAT_SHIFT - 3)); - /* XXX: currently, this code is not optimal. 
the range should be: - mpeg1: -255..255 - mpeg2: -2048..2047 - h263: -128..127 - mpeg4: -2048..2047 - */ - if (level > maxLevel) - level = maxLevel; - else if (level < minLevel) - level = minLevel; - - block[j] = level; +// if( bias+level >= (1<<(QMAT_SHIFT - 3)) +// || bias-level >= (1<<(QMAT_SHIFT - 3))){ + if(((unsigned)(level+threshold1))>threshold2){ + if(level>0){ + level= (bias + level)>>(QMAT_SHIFT - 3); + block[j]= level; + }else{ + level= (bias - level)>>(QMAT_SHIFT - 3); + block[j]= -level; + } + max |=level; last_non_zero = i; - } else { - block[j] = 0; + }else{ + block[j]=0; } } + *overflow= s->max_qcoeff < max; //overflow might have happend + return last_non_zero; } @@ -1822,7 +2253,7 @@ static void dct_unquantize_mpeg1_c(MpegEncContext *s, } } else { i = 0; - quant_matrix = s->non_intra_matrix; + quant_matrix = s->inter_matrix; for(;i<nCoeffs;i++) { int j= zigzag_direct[i]; level = block[j]; @@ -1848,6 +2279,69 @@ static void dct_unquantize_mpeg1_c(MpegEncContext *s, } } +static void dct_unquantize_mpeg2_c(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int i, level, nCoeffs; + const UINT16 *quant_matrix; + + if(s->alternate_scan) nCoeffs= 64; + else nCoeffs= s->block_last_index[n]+1; + + if (s->mb_intra) { + if (n < 4) + block[0] = block[0] * s->y_dc_scale; + else + block[0] = block[0] * s->c_dc_scale; + quant_matrix = s->intra_matrix; + for(i=1;i<nCoeffs;i++) { + int j= zigzag_direct[i]; + level = block[j]; + if (level) { + if (level < 0) { + level = -level; + level = (int)(level * qscale * quant_matrix[j]) >> 3; + level = -level; + } else { + level = (int)(level * qscale * quant_matrix[j]) >> 3; + } +#ifdef PARANOID + if (level < -2048 || level > 2047) + fprintf(stderr, "unquant error %d %d\n", i, level); +#endif + block[j] = level; + } + } + } else { + int sum=-1; + i = 0; + quant_matrix = s->inter_matrix; + for(;i<nCoeffs;i++) { + int j= zigzag_direct[i]; + level = block[j]; + if (level) { + if (level < 0) { + level = -level; + 
level = (((level << 1) + 1) * qscale * + ((int) (quant_matrix[j]))) >> 4; + level = -level; + } else { + level = (((level << 1) + 1) * qscale * + ((int) (quant_matrix[j]))) >> 4; + } +#ifdef PARANOID + if (level < -2048 || level > 2047) + fprintf(stderr, "unquant error %d %d\n", i, level); +#endif + block[j] = level; + sum+=level; + } + } + block[63]^=sum&1; + } +} + + static void dct_unquantize_h263_c(MpegEncContext *s, DCTELEM *block, int n, int qscale) { @@ -1891,188 +2385,117 @@ static void dct_unquantize_h263_c(MpegEncContext *s, } } -/* rate control */ - -/* an I frame is I_FRAME_SIZE_RATIO bigger than a P frame */ -#define I_FRAME_SIZE_RATIO 3.0 -#define QSCALE_K 20 - -static void rate_control_init(MpegEncContext *s) +static void remove_ac(MpegEncContext *s, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int mb_x, int mb_y) { -#if 1 - emms_c(); - - //initial values, they dont really matter as they will be totally different within a few frames - s->i_pred.coeff= s->p_pred.coeff= 7.0; - s->i_pred.count= s->p_pred.count= 1.0; - - s->i_pred.decay= s->p_pred.decay= 0.4; - - // use more bits at the beginning, otherwise high motion at the begin will look like shit - s->qsum=100; - s->qcount=100; - - s->short_term_qsum=0.001; - s->short_term_qcount=0.001; -#else - s->wanted_bits = 0; - - if (s->intra_only) { - s->I_frame_bits = ((INT64)s->bit_rate * FRAME_RATE_BASE) / s->frame_rate; - s->P_frame_bits = s->I_frame_bits; - } else { - s->P_frame_bits = (int) ((float)(s->gop_size * s->bit_rate) / - (float)((float)s->frame_rate / FRAME_RATE_BASE * (I_FRAME_SIZE_RATIO + s->gop_size - 1))); - s->I_frame_bits = (int)(s->P_frame_bits * I_FRAME_SIZE_RATIO); + int dc, dcb, dcr, y, i; + for(i=0; i<4; i++){ + dc= s->dc_val[0][mb_x*2+1 + (i&1) + (mb_y*2+1 + (i>>1))*(s->mb_width*2+2)]; + for(y=0; y<8; y++){ + int x; + for(x=0; x<8; x++){ + dest_y[x + (i&1)*8 + (y + (i>>1)*8)*s->linesize]= dc/8; + } + } + } + dcb = s->dc_val[1][mb_x+1 + (mb_y+1)*(s->mb_width+2)]; + dcr= 
s->dc_val[2][mb_x+1 + (mb_y+1)*(s->mb_width+2)]; + for(y=0; y<8; y++){ + int x; + for(x=0; x<8; x++){ + dest_cb[x + y*(s->linesize>>1)]= dcb/8; + dest_cr[x + y*(s->linesize>>1)]= dcr/8; + } } - -#if defined(DEBUG) - printf("I_frame_size=%d P_frame_size=%d\n", - s->I_frame_bits, s->P_frame_bits); -#endif -#endif -} - -static double predict(Predictor *p, double q, double var) -{ - return p->coeff*var / (q*p->count); -} - -static void update_predictor(Predictor *p, double q, double var, double size) -{ - double new_coeff= size*q / (var + 1); - if(var<1000) return; -/*{ -int pred= predict(p, q, var); -int error= abs(pred-size); -static double sum=0; -static int count=0; -if(count>5) sum+=error; -count++; -if(256*256*256*64%count==0){ - printf("%d %f %f\n", count, sum/count, p->coeff); -} -}*/ - p->count*= p->decay; - p->coeff*= p->decay; - p->count++; - p->coeff+= new_coeff; } -static int rate_estimate_qscale(MpegEncContext *s) +/** + * will conceal past errors, and allso drop b frames if needed + * + */ +void ff_conceal_past_errors(MpegEncContext *s, int unknown_pos) { -#if 1 - int qmin= s->qmin; - int qmax= s->qmax; - int rate_q=5; - float q; - int qscale; - float br_compensation; - double diff; - double short_term_q; - double long_term_q; - int last_qscale= s->qscale; - double fps; - INT64 wanted_bits; - emms_c(); - - fps= (double)s->frame_rate / FRAME_RATE_BASE; - wanted_bits= s->bit_rate*(double)s->picture_number/fps; - + int mb_x= s->mb_x; + int mb_y= s->mb_y; + int mb_dist=0; + int i, intra_count=0, inter_count=0; + int intra_conceal= s->msmpeg4_version ? 50 : 50; //FIXME finetune + int inter_conceal= s->msmpeg4_version ? 
50 : 50; - if(s->picture_number>2){ - /* update predictors */ - if(s->last_pict_type == I_TYPE){ - //FIXME - }else{ //P Frame -//printf("%d %d %d %f\n", s->qscale, s->last_mc_mb_var, s->frame_bits, s->p_pred.coeff); - update_predictor(&s->p_pred, s->qscale, s->last_mc_mb_var, s->frame_bits); - } - } + // for last block + if(mb_x>=s->mb_width) mb_x= s->mb_width -1; + if(mb_y>=s->mb_height) mb_y= s->mb_height-1; - if(s->pict_type == I_TYPE){ - //FIXME - rate_q= s->qsum/s->qcount; - }else{ //P Frame - int i; - int diff, best_diff=1000000000; - for(i=1; i<=31; i++){ - diff= predict(&s->p_pred, i, s->mc_mb_var) - (double)s->bit_rate/fps; - if(diff<0) diff= -diff; - if(diff<best_diff){ - best_diff= diff; - rate_q= i; - } - } + if(s->decoding_error==0 && unknown_pos){ + if(s->data_partitioning && s->pict_type!=B_TYPE) + s->decoding_error= DECODING_AC_LOST; + else + s->decoding_error= DECODING_DESYNC; } - s->short_term_qsum*=s->qblur; - s->short_term_qcount*=s->qblur; + if(s->decoding_error==DECODING_DESYNC && s->pict_type!=B_TYPE) s->next_p_frame_damaged=1; - s->short_term_qsum+= rate_q; - s->short_term_qcount++; - short_term_q= s->short_term_qsum/s->short_term_qcount; + for(i=mb_x + mb_y*s->mb_width; i>=0; i--){ + if(s->mbintra_table[i]) intra_count++; + else inter_count++; + } - long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0 - -// q= (long_term_q - short_term_q)*s->qcompress + short_term_q; - q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q); + if(s->decoding_error==DECODING_AC_LOST){ + intra_conceal*=2; + inter_conceal*=2; + }else if(s->decoding_error==DECODING_ACDC_LOST){ + intra_conceal*=2; + inter_conceal*=2; + } - diff= s->total_bits - wanted_bits; - br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance; - if(br_compensation<=0.0) br_compensation=0.001; - q/=br_compensation; + if(unknown_pos && (intra_count<inter_count)){ + intra_conceal= inter_conceal= s->mb_num; +// printf("%d 
%d\n",intra_count, inter_count); + } - qscale= (int)(q + 0.5); - if (qscale<qmin) qscale=qmin; - else if(qscale>qmax) qscale=qmax; - - if (qscale<last_qscale-s->max_qdiff) qscale=last_qscale-s->max_qdiff; - else if(qscale>last_qscale+s->max_qdiff) qscale=last_qscale+s->max_qdiff; + fprintf(stderr, "concealing errors\n"); + + /* for all MBs from the current one back until the last resync marker */ + for(; mb_y>=0 && mb_y>=s->resync_mb_y; mb_y--){ + for(; mb_x>=0; mb_x--){ + uint8_t *dest_y = s->current_picture[0] + (mb_y * 16* s->linesize ) + mb_x * 16; + uint8_t *dest_cb = s->current_picture[1] + (mb_y * 8 * (s->linesize >> 1)) + mb_x * 8; + uint8_t *dest_cr = s->current_picture[2] + (mb_y * 8 * (s->linesize >> 1)) + mb_x * 8; + int mb_x_backup= s->mb_x; //FIXME pass xy to mpeg_motion + int mb_y_backup= s->mb_y; + s->mb_x=mb_x; + s->mb_y=mb_y; + if(s->mbintra_table[mb_y*s->mb_width + mb_x] && mb_dist<intra_conceal){ + if(s->decoding_error==DECODING_AC_LOST){ + remove_ac(s, dest_y, dest_cb, dest_cr, mb_x, mb_y); +// printf("remove ac to %d %d\n", mb_x, mb_y); + }else{ + mpeg_motion(s, dest_y, dest_cb, dest_cr, 0, + s->last_picture, 0, 0, put_pixels_tab, + 0/*mx*/, 0/*my*/, 16); + } + } + else if(!s->mbintra_table[mb_y*s->mb_width + mb_x] && mb_dist<inter_conceal){ + int mx=0; + int my=0; + + if(s->decoding_error!=DECODING_DESYNC){ + int xy= mb_x*2+1 + (mb_y*2+1)*(s->mb_width*2+2); + mx= s->motion_val[ xy ][0]; + my= s->motion_val[ xy ][1]; + } - s->qsum+= qscale; - s->qcount++; + mpeg_motion(s, dest_y, dest_cb, dest_cr, 0, + s->last_picture, 0, 0, put_pixels_tab, + mx, my, 16); + } + s->mb_x= mb_x_backup; + s->mb_y= mb_y_backup; - s->last_pict_type= s->pict_type; -//printf("q:%d diff:%d comp:%f rate_q:%d st_q:%f fvar:%d last_size:%d\n", qscale, (int)diff, br_compensation, -// rate_q, short_term_q, s->mc_mb_var, s->frame_bits); -//printf("%d %d\n", s->bit_rate, (int)fps); - return qscale; -#else - INT64 diff, total_bits = s->total_bits; - float q; - int qscale; - if 
(s->pict_type == I_TYPE) { - s->wanted_bits += s->I_frame_bits; - } else { - s->wanted_bits += s->P_frame_bits; - } - diff = s->wanted_bits - total_bits; - q = 31.0 - (float)diff / (QSCALE_K * s->mb_height * s->mb_width); - /* adjust for I frame */ - if (s->pict_type == I_TYPE && !s->intra_only) { - q /= I_FRAME_SIZE_RATIO; - } - - /* using a too small Q scale leeds to problems in mpeg1 and h263 - because AC coefficients are clamped to 255 or 127 */ - qmin = 3; - if (q < qmin) - q = qmin; - else if (q > 31) - q = 31; - qscale = (int)(q + 0.5); -#if defined(DEBUG) - printf("\n%d: total=%0.0f wanted=%0.0f br=%0.1f diff=%d qest=%2.1f\n", - s->picture_number, - (double)total_bits, - (double)s->wanted_bits, - (float)s->frame_rate / FRAME_RATE_BASE * - total_bits / s->picture_number, - (int)diff, q); -#endif - return qscale; -#endif + if(mb_x== s->resync_mb_x && mb_y== s->resync_mb_y) return; + if(!s->mbskip_table[mb_x + mb_y*s->mb_width]) mb_dist++; + } + mb_x=s->mb_width-1; + } } AVCodec mpeg1video_encoder = { diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h index f809a1255..2e957451b 100644 --- a/src/libffmpeg/libavcodec/mpegvideo.h +++ b/src/libffmpeg/libavcodec/mpegvideo.h @@ -1,32 +1,31 @@ /* * Generic DCT based hybrid video encoder - * Copyright (c) 2000,2001 Gerard Lantau. + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
* - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/* Macros for picture code type. */ -#define I_TYPE 1 -#define P_TYPE 2 -#define B_TYPE 3 -#define S_TYPE 4 //S(GMC)-VOP MPEG4 +#ifndef AVCODEC_MPEGVIDEO_H +#define AVCODEC_MPEGVIDEO_H + +#define FRAME_SKIPED 100 // return value for header parsers if frame is not coded enum OutputFormat { FMT_MPEG1, FMT_H263, - FMT_MJPEG, + FMT_MJPEG, }; #define MPEG_BUF_SIZE (16 * 1024) @@ -36,6 +35,11 @@ enum OutputFormat { #define MAX_FCODE 7 #define MAX_MV 2048 +#define REORDER_BUFFER_SIZE (FF_MAX_B_FRAMES+2) + +#define ME_MAP_SIZE 64 +#define ME_MAP_SHIFT 3 +#define ME_MAP_MV_BITS 11 typedef struct Predictor{ double coeff; @@ -43,6 +47,33 @@ typedef struct Predictor{ double decay; } Predictor; +typedef struct RateControlEntry{ + int pict_type; + int qscale; + int mv_bits; + int i_tex_bits; + int p_tex_bits; + int misc_bits; + UINT64 expected_bits; + int new_pict_type; + float new_qscale; +}RateControlEntry; + +typedef struct RateControlContext{ + FILE *stats_file; + int num_entries; + RateControlEntry *entry; +}RateControlContext; + +typedef struct ReorderBuffer{ + UINT8 *picture[3]; + int pict_type; + int qscale; + int force_type; + 
int picture_number; + int picture_in_gop_number; +} ReorderBuffer; + typedef struct MpegEncContext { struct AVCodecContext *avctx; /* the following parameters must be initialized before encoding */ @@ -53,11 +84,15 @@ typedef struct MpegEncContext { int bit_rate; /* wanted bit rate */ int bit_rate_tolerance; /* amount of +- bits (>0)*/ enum OutputFormat out_format; /* output format */ + int h263_pred; /* use mpeg4/h263 ac/dc predictions */ + +/* the following codec id fields are deprecated in favor of codec_id */ int h263_plus; /* h263 plus headers */ int h263_rv10; /* use RV10 variation for H263 */ - int h263_pred; /* use mpeg4/h263 ac/dc predictions */ - int h263_msmpeg4; /* generate MSMPEG4 compatible stream */ + int h263_msmpeg4; /* generate MSMPEG4 compatible stream (deprecated, use msmpeg4_version instead)*/ int h263_intel; /* use I263 intel h263 header */ + + int codec_id; /* see CODEC_ID_xxx */ int fixed_qscale; /* fixed qscale if non zero */ float qcompress; /* amount of qscale change between easy & hard scenes (0.0-1.0) */ float qblur; /* amount of qscale smoothing over time (0.0-1.0) */ @@ -66,7 +101,16 @@ typedef struct MpegEncContext { int max_qdiff; /* max qscale difference between frames */ int encoding; /* true if we are encoding (vs decoding) */ int flags; /* AVCodecContext.flags (HQ, MV4, ...) */ - int force_type; /* 0= no force, otherwise I_TYPE, P_TYPE, ... */ + int force_input_type;/* 0= no force, otherwise I_TYPE, P_TYPE, ... */ + int max_b_frames; /* max number of b-frames for encoding */ + float b_quant_factor;/* qscale factor between ips and b frames */ + float b_quant_offset;/* qscale offset between ips and b frames */ + int rc_strategy; + int b_frame_strategy; + int luma_elim_threshold; + int chroma_elim_threshold; + int strict_std_compliance; /* strictly follow the std (MPEG4, ...) 
*/ + int workaround_bugs; /* workaround bugs in encoders which cannot be detected automatically */ /* the following fields are managed internally by the encoder */ /* bit output */ @@ -74,46 +118,71 @@ typedef struct MpegEncContext { /* sequence parameters */ int context_initialized; + int input_picture_number; + int input_picture_in_gop_number; /* 0-> first pic in gop, ... */ int picture_number; int fake_picture_number; /* picture number at the bitstream frame rate */ int gop_picture_number; /* index of the first picture of a GOP based on fake_pic_num & mpeg1 specific */ int picture_in_gop_number; /* 0-> first pic in gop, ... */ - int mb_width, mb_height; + int b_frames_since_non_b; /* used for encoding, relative to not yet reordered input */ + int mb_width, mb_height; /* number of MBs horizontally & vertically */ int mb_num; /* number of MBs of a picture */ int linesize; /* line size, in bytes, may be different from width */ UINT8 *new_picture[3]; /* picture to be compressed */ - UINT8 *last_picture[3]; /* previous picture */ + UINT8 *picture_buffer[REORDER_BUFFER_SIZE][3]; /* internal buffers used for reordering of input pictures */ + int picture_buffer_index; + ReorderBuffer coded_order[REORDER_BUFFER_SIZE]; + UINT8 *last_picture[3]; /* previous picture */ UINT8 *last_picture_base[3]; /* real start of the picture */ - UINT8 *next_picture[3]; /* previous picture (for bidir pred) */ + UINT8 *next_picture[3]; /* previous picture (for bidir pred) */ UINT8 *next_picture_base[3]; /* real start of the picture */ - UINT8 *aux_picture[3]; /* aux picture (for B frames only) */ - UINT8 *aux_picture_base[3]; /* real start of the picture */ - UINT8 *current_picture[3]; /* buffer to store the decompressed current picture */ - int last_dc[3]; /* last DC values for MPEG1 */ - INT16 *dc_val[3]; /* used for mpeg4 DC prediction, all 3 arrays must be continuous */ + UINT8 *aux_picture[3]; /* aux picture (for B frames only) */ + UINT8 *aux_picture_base[3]; /* real start of the 
picture */ + UINT8 *current_picture[3]; /* buffer to store the decompressed current picture */ + int num_available_buffers; /* is 0 at the start & after seeking, after the first I frame its 1 after next I/P 2 */ + int last_dc[3]; /* last DC values for MPEG1 */ + INT16 *dc_val[3]; /* used for mpeg4 DC prediction, all 3 arrays must be continuous */ int y_dc_scale, c_dc_scale; - UINT8 *coded_block; /* used for coded block pattern prediction */ - INT16 (*ac_val[3])[16]; /* used for for mpeg4 AC prediction, all 3 arrays must be continuous */ + UINT8 *coded_block; /* used for coded block pattern prediction (msmpeg4v3, wmv1)*/ + INT16 (*ac_val[3])[16]; /* used for for mpeg4 AC prediction, all 3 arrays must be continuous */ int ac_pred; int mb_skiped; /* MUST BE SET only during DECODING */ - UINT8 *mbskip_table; /* used to avoid copy if macroblock - skipped (for black regions for example) */ - UINT8 *mbintra_table; /* used to kill a few memsets */ - - int qscale; - int pict_type; - int last_non_b_pict_type; /* used for mpeg4 gmc b-frames */ - int last_pict_type; /* used for bit rate stuff (needs that to update the right predictor) */ + UINT8 *mbskip_table; /* used to avoid copy if macroblock skipped (for black regions for example) + and used for b-frame encoding & decoding (contains skip table of next P Frame) */ + UINT8 *mbintra_table; /* used to avoid setting {ac, dc, cbp}-pred stuff to zero on inter MB decoding */ + UINT8 *cbp_table; /* used to store cbp, ac_pred for partitioned decoding */ + UINT8 *pred_dir_table; /* used to store pred_dir for partitioned decoding */ + INT8 *qscale_table; /* used to store qscale for partitioned decoding (& postprocessing FIXME export) */ + + int input_qscale; /* qscale prior to reordering of frames */ + int input_pict_type; /* pict_type prior to reordering of frames */ + int force_type; /* 0= no force, otherwise I_TYPE, P_TYPE, ... 
*/ + int qscale; /* QP */ + int last_non_b_qscale; /* QP of last non b frame used for b frame qscale*/ + int pict_type; /* I_TYPE, P_TYPE, B_TYPE, ... */ + int last_non_b_pict_type; /* used for mpeg4 gmc b-frames & ratecontrol */ int frame_rate_index; /* motion compensation */ int unrestricted_mv; int h263_long_vectors; /* use horrible h263v1 long vector mode */ - int f_code; /* resolution */ - int b_code; /* backward resolution for B Frames (mpeg4) */ - INT16 *mv_table[2]; /* MV table (1MV per MB)*/ - INT16 (*motion_val)[2]; /* used for MV prediction (4MV per MB)*/ - int full_search; + int f_code; /* forward MV resolution */ + int b_code; /* backward MV resolution for B Frames (mpeg4) */ + INT16 (*motion_val)[2]; /* used for MV prediction (4MV per MB) */ + INT16 (*p_mv_table)[2]; /* MV table (1MV per MB) p-frame encoding */ + INT16 (*b_forw_mv_table)[2]; /* MV table (1MV per MB) forward mode b-frame encoding */ + INT16 (*b_back_mv_table)[2]; /* MV table (1MV per MB) backward mode b-frame encoding */ + INT16 (*b_bidir_forw_mv_table)[2]; /* MV table (1MV per MB) bidir mode b-frame encoding */ + INT16 (*b_bidir_back_mv_table)[2]; /* MV table (1MV per MB) bidir mode b-frame encoding */ + INT16 (*b_direct_forw_mv_table)[2];/* MV table (1MV per MB) direct mode b-frame encoding */ + INT16 (*b_direct_back_mv_table)[2];/* MV table (1MV per MB) direct mode b-frame encoding */ + INT16 (*b_direct_mv_table)[2]; /* MV table (1MV per MB) direct mode b-frame encoding */ + int me_method; /* ME algorithm */ + uint8_t *me_scratchpad; /* data area for the me algo, so that the ME doesnt need to malloc/free */ + uint32_t *me_map; /* map to avoid duplicate evaluations */ + uint16_t *me_score_map; /* map to store the SADs */ + int me_map_generation; + int skip_me; /* set if ME is skiped for the current MB */ int mv_dir; #define MV_DIR_BACKWARD 1 #define MV_DIR_FORWARD 2 @@ -131,62 +200,76 @@ typedef struct MpegEncContext { */ int mv[2][4][2]; int field_select[2][2]; - int 
last_mv[2][2][2]; + int last_mv[2][2][2]; /* last MV, used for MV prediction in MPEG1 & B-frame MPEG4 */ UINT16 (*mv_penalty)[MAX_MV*2+1]; /* amount of bits needed to encode a MV, used for ME */ UINT8 *fcode_tab; /* smallest fcode needed for each MV */ int has_b_frames; - int no_rounding; /* apply no rounding to motion estimation (MPEG4) */ + int no_rounding; /* apply no rounding to motion compensation (MPEG4, msmpeg4, ...) + for b-frames rounding mode is allways 0 */ + + int hurry_up; /* when set to 1 during decoding, b frames will be skiped + when set to 2 idct/dequant will be skipped too */ /* macroblock layer */ int mb_x, mb_y; int mb_incr; int mb_intra; - UINT16 *mb_var; /* Table for MB variances */ - UINT8 *mb_type; /* Table for MB type */ + UINT16 *mb_var; /* Table for MB variances */ + UINT16 *mc_mb_var; /* Table for motion compensated MB variances */ + UINT8 *mb_type; /* Table for MB type */ #define MB_TYPE_INTRA 0x01 #define MB_TYPE_INTER 0x02 #define MB_TYPE_INTER4V 0x04 #define MB_TYPE_SKIPED 0x08 +#define MB_TYPE_GMC 0x10 + #define MB_TYPE_DIRECT 0x10 #define MB_TYPE_FORWARD 0x20 -#define MB_TYPE_BACKWAD 0x40 +#define MB_TYPE_BACKWARD 0x40 #define MB_TYPE_BIDIR 0x80 - int block_index[6]; + int block_index[6]; /* index to current MB in block based arrays with edges*/ int block_wrap[6]; /* matrix transmitted in the bitstream */ UINT16 intra_matrix[64]; UINT16 chroma_intra_matrix[64]; - UINT16 non_intra_matrix[64]; - UINT16 chroma_non_intra_matrix[64]; + UINT16 inter_matrix[64]; + UINT16 chroma_inter_matrix[64]; +#define QUANT_BIAS_SHIFT 4 + int intra_quant_bias; /* bias for the quantizer */ + int inter_quant_bias; /* bias for the quantizer */ + int min_qcoeff; /* minimum encodable coefficient */ + int max_qcoeff; /* maximum encodable coefficient */ /* precomputed matrix (combine qscale and DCT renorm) */ - int q_intra_matrix[64]; - int q_non_intra_matrix[64]; + int q_intra_matrix[32][64]; + int q_inter_matrix[32][64]; /* identical to the above but for 
MMX & these are not permutated */ - UINT16 __align8 q_intra_matrix16[64] ; - UINT16 __align8 q_non_intra_matrix16[64]; + UINT16 __align8 q_intra_matrix16[32][64]; + UINT16 __align8 q_inter_matrix16[32][64]; + UINT16 __align8 q_intra_matrix16_bias[32][64]; + UINT16 __align8 q_inter_matrix16_bias[32][64]; int block_last_index[6]; /* last non zero coefficient in block */ void *opaque; /* private data for the user */ /* bit rate control */ - int I_frame_bits; /* wanted number of bits per I frame */ - int P_frame_bits; /* same for P frame */ - int avg_mb_var; /* average MB variance for current frame */ - int mc_mb_var; /* motion compensated MB variance for current frame */ - int last_mc_mb_var; /* motion compensated MB variance for last frame */ + int I_frame_bits; //FIXME used in mpeg12 ... + int mb_var_sum; /* sum of MB variance for current frame */ + int mc_mb_var_sum; /* motion compensated MB variance for current frame */ + int last_non_b_mc_mb_var;/* motion compensated MB variance for last non b frame */ INT64 wanted_bits; INT64 total_bits; - int frame_bits; /* bits used for the current frame */ - int last_frame_bits; /* bits used for the last frame */ + int frame_bits; /* bits used for the current frame */ + int pb_frame_bits; /* bits of the last b...bp group */ Predictor i_pred; Predictor p_pred; double qsum; /* sum of qscales */ double qcount; /* count of qscales */ double short_term_qsum; /* sum of recent qscales */ double short_term_qcount; /* count of recent qscales */ + RateControlContext rc_context; /* statistics, used for 2-pass encoding */ int mv_bits; @@ -198,11 +281,24 @@ typedef struct MpegEncContext { int skip_count; int misc_bits; // cbp, mb_type int last_bits; //temp var used for calculating the above vars + + /* error concealment / resync */ + int resync_mb_x; /* x position of last resync marker */ + int resync_mb_y; /* y position of last resync marker */ + int mb_num_left; /* number of MBs left in this video packet */ + GetBitContext 
next_resync_gb; /* starts at the next resync marker */ + int next_resync_qscale; /* qscale of next resync marker */ + int next_resync_pos; /* bitstream position of next resync marker */ +#define DECODING_AC_LOST -1 +#define DECODING_ACDC_LOST -2 +#define DECODING_DESYNC -3 + int decoding_error; + int next_p_frame_damaged; /* set if the next p frame is damaged, to avoid showing trashed b frames */ + int error_resilience; /* H.263 specific */ int gob_number; int gob_index; - int first_gob_line; /* H.263+ specific */ int umvplus; @@ -212,11 +308,13 @@ typedef struct MpegEncContext { /* mpeg4 specific */ int time_increment_resolution; - int time_increment_bits; - int time_increment; - int time_base; - int time; - int last_non_b_time[2]; + int time_increment_bits; /* number of bits to represent the fractional part of time */ + int last_time_base; + int time_base; /* time in seconds of last I,P,S Frame */ + INT64 time; /* time of current frame */ + INT64 last_non_b_time; + UINT16 pp_time; /* time distance between the last 2 p,s,i frames */ + UINT16 bp_time; /* time distance between the last b and p,s,i frame */ int shape; int vol_sprite_usage; int sprite_width; @@ -231,21 +329,32 @@ typedef struct MpegEncContext { int sprite_shift[2][2]; int mcsel; int quant_precision; - int quarter_sample; + int quarter_sample; /* 1->qpel, 0->half pel ME/MC */ int scalability; int new_pred; int reduced_res_vop; int aspect_ratio_info; int sprite_warping_accuracy; int low_latency_sprite; - int data_partioning; - int resync_marker; - int resync_x_pos; + int data_partitioning; + int rvlc; /* reversible vlc */ + int resync_marker; /* could this stream contain resync markers*/ + int low_delay; /* no reordering needed / has no b-frames */ + int vo_type; + int vol_control_parameters; /* does the stream contain the low_delay flag, used to workaround buggy encoders */ + PutBitContext tex_pb; /* used for data partitioned VOPs */ + PutBitContext pb2; /* used for data partitioned VOPs */ +#define 
PB_BUFFER_SIZE 1024*256 + uint8_t *tex_pb_buffer; + uint8_t *pb2_buffer; /* divx specific, used to workaround (many) bugs in divx5 */ int divx_version; int divx_build; - +#define BITSTREAM_BUFFER_SIZE 1024*256 + UINT8 *bitstream_buffer; //Divx 5.01 puts several frames in a single one, this is used to reorder them + int bitstream_buffer_size; + /* RV10 specific */ int rv10_version; /* RV10 version: 0 or 3 */ int rv10_first_dc_coded[3]; @@ -256,6 +365,7 @@ typedef struct MpegEncContext { int mjpeg_hsample[3]; /* horizontal sampling factors, default = {2, 1, 1} */ int mjpeg_write_tables; /* do we want to have quantisation- and huffmantables in the jpeg file ? */ + int mjpeg_data_only_frames; /* frames only with SOI, SOS and EOI markers */ /* MSMPEG4 specific */ int mv_table_index; @@ -266,8 +376,7 @@ typedef struct MpegEncContext { int slice_height; /* in macroblocks */ int first_slice_line; /* used in mpeg4 too to handle resync markers */ int flipflop_rounding; - int bitrate; - int msmpeg4_version; /* 1=mp41, 2=mp42, 3=mp43/divx3 */ + int msmpeg4_version; /* 0=not msmpeg4, 1=mp41, 2=mp42, 3=mp43/divx3 */ /* decompression specific */ GetBitContext gb; @@ -306,10 +415,14 @@ typedef struct MpegEncContext { UINT32 mb_line_avgsize; DCTELEM (*block)[64]; /* points to one of the following blocks */ - DCTELEM intra_block[6][64] __align8; - DCTELEM inter_block[6][64] __align8; - DCTELEM inter4v_block[6][64] __align8; - void (*dct_unquantize)(struct MpegEncContext *s, + DCTELEM blocks[2][6][64] __align8; // for HQ mode we need to keep the best block + void (*dct_unquantize_mpeg1)(struct MpegEncContext *s, + DCTELEM *block, int n, int qscale); + void (*dct_unquantize_mpeg2)(struct MpegEncContext *s, + DCTELEM *block, int n, int qscale); + void (*dct_unquantize_h263)(struct MpegEncContext *s, + DCTELEM *block, int n, int qscale); + void (*dct_unquantize)(struct MpegEncContext *s, // unquantizer to use (mpeg4 can use both) DCTELEM *block, int n, int qscale); } MpegEncContext; @@ 
-321,11 +434,20 @@ void MPV_frame_end(MpegEncContext *s); #ifdef HAVE_MMX void MPV_common_init_mmx(MpegEncContext *s); #endif +extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow); +extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w); +void ff_conceal_past_errors(MpegEncContext *s, int conceal_all); +void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length); +void ff_clean_intra_table_entries(MpegEncContext *s); /* motion_est.c */ - -void estimate_motion(MpegEncContext *s, - int mb_x, int mb_y); +void ff_estimate_p_frame_motion(MpegEncContext * s, + int mb_x, int mb_y); +void ff_estimate_b_frame_motion(MpegEncContext * s, + int mb_x, int mb_y); +int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type); +void ff_fix_long_p_mvs(MpegEncContext * s); +void ff_fix_long_b_mvs(MpegEncContext * s, int16_t (*mv_table)[2], int f_code, int type); /* mpeg12.c */ extern INT16 default_intra_matrix[64]; @@ -382,6 +504,7 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block, int *px, int *py); void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, int dir); +void ff_set_mpeg4_time(MpegEncContext * s, int picture_number); void mpeg4_encode_picture_header(MpegEncContext *s, int picture_number); void h263_encode_init(MpegEncContext *s); @@ -393,6 +516,13 @@ int intel_h263_decode_picture_header(MpegEncContext *s); int h263_decode_mb(MpegEncContext *s, DCTELEM block[6][64]); int h263_get_picture_format(int width, int height); +int ff_mpeg4_decode_video_packet_header(MpegEncContext *s); +int ff_mpeg4_resync(MpegEncContext *s); +void ff_mpeg4_encode_video_packet_header(MpegEncContext *s); +void ff_mpeg4_clean_buffers(MpegEncContext *s); +void ff_mpeg4_stuffing(PutBitContext * pbc); +void ff_mpeg4_init_partitions(MpegEncContext *s); +void ff_mpeg4_merge_partitions(MpegEncContext *s); /* rv10.c */ void rv10_encode_picture_header(MpegEncContext *s, int picture_number); @@ -404,12 +534,12 @@ 
void msmpeg4_encode_ext_header(MpegEncContext * s); void msmpeg4_encode_mb(MpegEncContext * s, DCTELEM block[6][64], int motion_x, int motion_y); -void msmpeg4_dc_scale(MpegEncContext * s); int msmpeg4_decode_picture_header(MpegEncContext * s); int msmpeg4_decode_ext_header(MpegEncContext * s, int buf_size); int msmpeg4_decode_mb(MpegEncContext *s, DCTELEM block[6][64]); int msmpeg4_decode_init_vlc(MpegEncContext *s); +void ff_old_msmpeg4_dc_scale(MpegEncContext *s); /* mjpegenc.c */ @@ -419,3 +549,12 @@ void mjpeg_encode_mb(MpegEncContext *s, DCTELEM block[6][64]); void mjpeg_picture_header(MpegEncContext *s); void mjpeg_picture_trailer(MpegEncContext *s); + +/* rate control */ +int ff_rate_control_init(MpegEncContext *s); +int ff_rate_estimate_qscale(MpegEncContext *s); +int ff_rate_estimate_qscale_pass2(MpegEncContext *s); +void ff_write_pass1_stats(MpegEncContext *s); +void ff_rate_control_uninit(MpegEncContext *s); + +#endif /* AVCODEC_MPEGVIDEO_H */ diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c index 66fc5255e..629c74497 100644 --- a/src/libffmpeg/libavcodec/msmpeg4.c +++ b/src/libffmpeg/libavcodec/msmpeg4.c @@ -1,27 +1,27 @@ /* * MSMPEG4 backend for ffmpeg encoder and decoder - * Copyright (c) 2001 Gerard Lantau. + * Copyright (c) 2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
* - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at> */ -#include <stdlib.h> -#include <stdio.h> -#include "common.h" +#include "avcodec.h" #include "dsputil.h" #include "mpegvideo.h" -#include "avcodec.h" + /* * You can also call this codec : MPEG4 with a twist ! 
@@ -53,7 +53,7 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr); static int msmpeg4_decode_motion(MpegEncContext * s, int *mx_ptr, int *my_ptr); static void msmpeg4v2_encode_motion(MpegEncContext * s, int val); -static void init_h263_dc_for_msmpeg4(); +static void init_h263_dc_for_msmpeg4(void); extern UINT32 inverse[256]; @@ -137,7 +137,7 @@ static void init_mv_table(MVTable *tab) { int i, x, y; - tab->table_mv_index = malloc(sizeof(UINT16) * 4096); + tab->table_mv_index = av_malloc(sizeof(UINT16) * 4096); /* mark all entries as not used */ for(i=0;i<4096;i++) tab->table_mv_index[i] = tab->n; @@ -159,7 +159,7 @@ static void code012(PutBitContext *pb, int n) } } -/* write MSMPEG4 V3 compatible frame header */ +/* write MSMPEG4 compatible frame header */ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number) { int i; @@ -171,7 +171,7 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number) put_bits(&s->pb, 5, s->qscale); s->rl_table_index = 2; - if(s->msmpeg4_version==2) + if(s->msmpeg4_version<=2) s->rl_chroma_table_index = 2; /* only for I frame */ else s->rl_chroma_table_index = 1; /* only for I frame */ @@ -183,7 +183,7 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number) if (s->pict_type == I_TYPE) { put_bits(&s->pb, 5, 0x17); /* indicate only one "slice" */ - if(s->msmpeg4_version!=2){ + if(s->msmpeg4_version>2){ code012(&s->pb, s->rl_chroma_table_index); code012(&s->pb, s->rl_table_index); @@ -194,7 +194,7 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number) put_bits(&s->pb, 1, s->use_skip_mb_code); s->rl_chroma_table_index = s->rl_table_index; - if(s->msmpeg4_version!=2){ + if(s->msmpeg4_version>2){ code012(&s->pb, s->rl_table_index); put_bits(&s->pb, 1, s->dc_table_index); @@ -228,14 +228,16 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number) void msmpeg4_encode_ext_header(MpegEncContext * s) { - 
s->flipflop_rounding=1; - s->bitrate= 910; // FIXME - put_bits(&s->pb, 5, s->frame_rate / FRAME_RATE_BASE); //yes 29.97 -> 29 - put_bits(&s->pb, 11, s->bitrate); + put_bits(&s->pb, 11, MIN(s->bit_rate, 2047)); - put_bits(&s->pb, 1, s->flipflop_rounding); + if(s->msmpeg4_version<3) + s->flipflop_rounding=0; + else{ + s->flipflop_rounding=1; + put_bits(&s->pb, 1, s->flipflop_rounding); + } } /* predict coded block */ @@ -328,7 +330,7 @@ void msmpeg4_encode_mb(MpegEncContext * s, if (s->use_skip_mb_code) put_bits(&s->pb, 1, 0); /* mb coded */ - if(s->msmpeg4_version==2){ + if(s->msmpeg4_version<=2){ put_bits(&s->pb, v2_mb_type[cbp&3][1], v2_mb_type[cbp&3][0]); @@ -373,7 +375,7 @@ void msmpeg4_encode_mb(MpegEncContext * s, printf("cbp=%x %x\n", cbp, coded_cbp); #endif - if(s->msmpeg4_version==2){ + if(s->msmpeg4_version<=2){ if (s->pict_type == I_TYPE) { put_bits(&s->pb, v2_intra_cbpc[cbp&3][1], v2_intra_cbpc[cbp&3][0]); @@ -410,11 +412,10 @@ void msmpeg4_encode_mb(MpegEncContext * s, } } - -/* strongly inspirated from MPEG4, but not exactly the same ! 
*/ -void msmpeg4_dc_scale(MpegEncContext * s) +/* old ffmpeg msmpeg4v3 mode */ +void ff_old_msmpeg4_dc_scale(MpegEncContext * s) { - if (s->qscale < 5 || s->msmpeg4_version==2){ + if (s->qscale < 5){ s->y_dc_scale = 8; s->c_dc_scale = 8; }else if (s->qscale < 9){ @@ -426,6 +427,21 @@ void msmpeg4_dc_scale(MpegEncContext * s) } } +static int msmpeg4v1_pred_dc(MpegEncContext * s, int n, + INT32 **dc_val_ptr) +{ + int i; + + if (n < 4) { + i= 0; + } else { + i= n-3; + } + + *dc_val_ptr= &s->last_dc[i]; + return s->last_dc[i]; +} + /* dir = 0: left, dir = 1: top prediction */ static int msmpeg4_pred_dc(MpegEncContext * s, int n, INT16 **dc_val_ptr, int *dir_ptr) @@ -439,6 +455,7 @@ static int msmpeg4_pred_dc(MpegEncContext * s, int n, } else { scale = s->c_dc_scale; } + wrap = s->block_wrap[n]; dc_val= s->dc_val[0] + s->block_index[n]; @@ -508,21 +525,29 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr { int sign, code; int pred; - INT16 *dc_val; - pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr); + if(s->msmpeg4_version==1){ + INT32 *dc_val; + pred = msmpeg4v1_pred_dc(s, n, &dc_val); + + /* update predictor */ + *dc_val= level; + }else{ + INT16 *dc_val; + pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr); - /* update predictor */ - if (n < 4) { - *dc_val = level * s->y_dc_scale; - } else { - *dc_val = level * s->c_dc_scale; + /* update predictor */ + if (n < 4) { + *dc_val = level * s->y_dc_scale; + } else { + *dc_val = level * s->c_dc_scale; + } } /* do the prediction */ level -= pred; - if(s->msmpeg4_version==2){ + if(s->msmpeg4_version<=2){ if (n < 4) { put_bits(&s->pb, v2_dc_lum_table[level+256][1], @@ -589,7 +614,7 @@ static void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n) } else { i = 0; rl = &rl_table[3 + s->rl_table_index]; - if(s->msmpeg4_version==2) + if(s->msmpeg4_version<=2) run_diff = 0; else run_diff = 1; @@ -669,9 +694,11 @@ static VLC cbpy_vlc; static VLC v2_intra_cbpc_vlc; static VLC v2_mb_type_vlc; 
static VLC v2_mv_vlc; +static VLC v1_intra_cbpc_vlc; +static VLC v1_inter_cbpc_vlc; /* this table is practically identical to the one from h263 except that its inverted */ -static void init_h263_dc_for_msmpeg4() +static void init_h263_dc_for_msmpeg4(void) { static int inited=0; @@ -733,60 +760,73 @@ static void init_h263_dc_for_msmpeg4() /* init all vlc decoding tables */ int msmpeg4_decode_init_vlc(MpegEncContext *s) { + static int done = 0; int i; MVTable *mv; - for(i=0;i<NB_RL_TABLES;i++) { - init_rl(&rl_table[i]); - init_vlc_rl(&rl_table[i]); - } - for(i=0;i<2;i++) { - mv = &mv_tables[i]; - init_vlc(&mv->vlc, 9, mv->n + 1, - mv->table_mv_bits, 1, 1, - mv->table_mv_code, 2, 2); - } - init_vlc(&dc_lum_vlc[0], 9, 120, - &table0_dc_lum[0][1], 8, 4, - &table0_dc_lum[0][0], 8, 4); - init_vlc(&dc_chroma_vlc[0], 9, 120, - &table0_dc_chroma[0][1], 8, 4, - &table0_dc_chroma[0][0], 8, 4); - init_vlc(&dc_lum_vlc[1], 9, 120, - &table1_dc_lum[0][1], 8, 4, - &table1_dc_lum[0][0], 8, 4); - init_vlc(&dc_chroma_vlc[1], 9, 120, - &table1_dc_chroma[0][1], 8, 4, - &table1_dc_chroma[0][0], 8, 4); + if (!done) { + done = 1; + + for(i=0;i<NB_RL_TABLES;i++) { + init_rl(&rl_table[i]); + init_vlc_rl(&rl_table[i]); + } + for(i=0;i<2;i++) { + mv = &mv_tables[i]; + init_vlc(&mv->vlc, 9, mv->n + 1, + mv->table_mv_bits, 1, 1, + mv->table_mv_code, 2, 2); + } + + init_vlc(&dc_lum_vlc[0], 9, 120, + &table0_dc_lum[0][1], 8, 4, + &table0_dc_lum[0][0], 8, 4); + init_vlc(&dc_chroma_vlc[0], 9, 120, + &table0_dc_chroma[0][1], 8, 4, + &table0_dc_chroma[0][0], 8, 4); + init_vlc(&dc_lum_vlc[1], 9, 120, + &table1_dc_lum[0][1], 8, 4, + &table1_dc_lum[0][0], 8, 4); + init_vlc(&dc_chroma_vlc[1], 9, 120, + &table1_dc_chroma[0][1], 8, 4, + &table1_dc_chroma[0][0], 8, 4); - init_h263_dc_for_msmpeg4(); - init_vlc(&v2_dc_lum_vlc, 9, 512, - &v2_dc_lum_table[0][1], 8, 4, - &v2_dc_lum_table[0][0], 8, 4); - init_vlc(&v2_dc_chroma_vlc, 9, 512, - &v2_dc_chroma_table[0][1], 8, 4, - &v2_dc_chroma_table[0][0], 8, 4); + 
init_h263_dc_for_msmpeg4(); + init_vlc(&v2_dc_lum_vlc, 9, 512, + &v2_dc_lum_table[0][1], 8, 4, + &v2_dc_lum_table[0][0], 8, 4); + init_vlc(&v2_dc_chroma_vlc, 9, 512, + &v2_dc_chroma_table[0][1], 8, 4, + &v2_dc_chroma_table[0][0], 8, 4); - init_vlc(&cbpy_vlc, 6, 16, - &cbpy_tab[0][1], 2, 1, - &cbpy_tab[0][0], 2, 1); - init_vlc(&v2_intra_cbpc_vlc, 3, 4, - &v2_intra_cbpc[0][1], 2, 1, - &v2_intra_cbpc[0][0], 2, 1); - init_vlc(&v2_mb_type_vlc, 5, 8, - &v2_mb_type[0][1], 2, 1, - &v2_mb_type[0][0], 2, 1); - init_vlc(&v2_mv_vlc, 9, 33, - &mvtab[0][1], 2, 1, - &mvtab[0][0], 2, 1); - - init_vlc(&mb_non_intra_vlc, 9, 128, - &table_mb_non_intra[0][1], 8, 4, - &table_mb_non_intra[0][0], 8, 4); - init_vlc(&mb_intra_vlc, 9, 64, - &table_mb_intra[0][1], 4, 2, - &table_mb_intra[0][0], 4, 2); + init_vlc(&cbpy_vlc, 6, 16, + &cbpy_tab[0][1], 2, 1, + &cbpy_tab[0][0], 2, 1); + init_vlc(&v2_intra_cbpc_vlc, 3, 4, + &v2_intra_cbpc[0][1], 2, 1, + &v2_intra_cbpc[0][0], 2, 1); + init_vlc(&v2_mb_type_vlc, 5, 8, + &v2_mb_type[0][1], 2, 1, + &v2_mb_type[0][0], 2, 1); + init_vlc(&v2_mv_vlc, 9, 33, + &mvtab[0][1], 2, 1, + &mvtab[0][0], 2, 1); + + init_vlc(&mb_non_intra_vlc, 9, 128, + &table_mb_non_intra[0][1], 8, 4, + &table_mb_non_intra[0][0], 8, 4); + init_vlc(&mb_intra_vlc, 9, 64, + &table_mb_intra[0][1], 4, 2, + &table_mb_intra[0][0], 4, 2); + + init_vlc(&v1_intra_cbpc_vlc, 6, 8, + intra_MCBPC_bits, 1, 1, + intra_MCBPC_code, 1, 1); + init_vlc(&v1_inter_cbpc_vlc, 6, 25, + inter_MCBPC_bits, 1, 1, + inter_MCBPC_code, 1, 1); + } return 0; } @@ -802,31 +842,84 @@ static int decode012(GetBitContext *gb) int msmpeg4_decode_picture_header(MpegEncContext * s) { - int code; + int code, code2; + +#if 0 +{ +int i; +for(i=0; i<s->gb.size*8; i++) + printf("%d", get_bits1(&s->gb)); +// get_bits1(&s->gb); +printf("END\n"); +return -1; +} +#endif + + if(s->msmpeg4_version==1){ + int start_code, num; + start_code = (get_bits(&s->gb, 16)<<16) | get_bits(&s->gb, 16); + if(start_code!=0x00000100){ + 
fprintf(stderr, "invalid startcode\n"); + return -1; + } + + num= get_bits(&s->gb, 5); // frame number */ + } s->pict_type = get_bits(&s->gb, 2) + 1; if (s->pict_type != I_TYPE && - s->pict_type != P_TYPE) + s->pict_type != P_TYPE){ + fprintf(stderr, "invalid picture type\n"); return -1; + } s->qscale = get_bits(&s->gb, 5); if (s->pict_type == I_TYPE) { code = get_bits(&s->gb, 5); - /* 0x17: one slice, 0x18: two slices */ - if (code < 0x17) - return -1; - s->slice_height = s->mb_height / (code - 0x16); - if(s->msmpeg4_version==2){ + if(s->msmpeg4_version==1){ + if(code==0 || code>s->mb_height){ + fprintf(stderr, "invalid slice height %d\n", code); + return -1; + } + + s->slice_height = code; + }else{ + /* 0x17: one slice, 0x18: two slices, ... */ + if (code < 0x17) + return -1; + + s->slice_height = s->mb_height / (code - 0x16); + } + + switch(s->msmpeg4_version){ + case 1: + case 2: s->rl_chroma_table_index = 2; s->rl_table_index = 2; s->dc_table_index = 0; //not used - }else{ + break; + case 3: s->rl_chroma_table_index = decode012(&s->gb); s->rl_table_index = decode012(&s->gb); s->dc_table_index = get_bits1(&s->gb); + break; + case 4: + msmpeg4_decode_ext_header(s, 999 /* bufer size (useless here) */); + printf("%X\n", show_bits(&s->gb, 24)); + code= get_bits(&s->gb, 2); + if(code==1){ + code2= get_bits(&s->gb, 3); + if(code2==7) skip_bits(&s->gb, 1); + } + printf("%X\n", show_bits(&s->gb, 24)); + s->rl_chroma_table_index = 2; + s->rl_table_index = 2; + + s->dc_table_index = 0; + break; } s->no_rounding = 1; /* printf(" %d %d %d %d \n", @@ -835,22 +928,28 @@ int msmpeg4_decode_picture_header(MpegEncContext * s) s->rl_table_index, s->dc_table_index);*/ } else { - s->use_skip_mb_code = get_bits1(&s->gb); - if(s->msmpeg4_version==2){ + switch(s->msmpeg4_version){ + case 1: + case 2: + if(s->msmpeg4_version==1) + s->use_skip_mb_code = 1; + else + s->use_skip_mb_code = get_bits1(&s->gb); s->rl_table_index = 2; s->rl_chroma_table_index = s->rl_table_index; - 
s->dc_table_index = 0; //not used - s->mv_table_index = 0; - }else{ + break; + case 3: + s->use_skip_mb_code = get_bits1(&s->gb); s->rl_table_index = decode012(&s->gb); s->rl_chroma_table_index = s->rl_table_index; s->dc_table_index = get_bits1(&s->gb); s->mv_table_index = get_bits1(&s->gb); + break; } /* printf(" %d %d %d %d %d \n", s->use_skip_mb_code, @@ -864,6 +963,7 @@ int msmpeg4_decode_picture_header(MpegEncContext * s) s->no_rounding = 0; } // printf("%d", s->no_rounding); +//return -1; } #if 0 @@ -886,27 +986,36 @@ return -1; int msmpeg4_decode_ext_header(MpegEncContext * s, int buf_size) { + int left= buf_size*8 - get_bits_count(&s->gb); + int length= s->msmpeg4_version>=3 ? 17 : 16; /* the alt_bitstream reader could read over the end so we need to check it */ - if(get_bits_count(&s->gb) + 16 < buf_size*8) + if(left>=length && left<length+8) { int fps; fps= get_bits(&s->gb, 5); - s->bitrate= get_bits(&s->gb, 11); - s->flipflop_rounding= get_bits1(&s->gb); + s->bit_rate= get_bits(&s->gb, 11); + if(s->msmpeg4_version>=3) + s->flipflop_rounding= get_bits1(&s->gb); + else + s->flipflop_rounding= 0; -// printf("fps:%2d bps:%2d roundingType:%1d\n", fps, s->bitrate, s->flipflop_rounding); +// printf("fps:%2d bps:%2d roundingType:%1d\n", fps, s->bit_rate, s->flipflop_rounding); } - else + else if(left<length+8) { s->flipflop_rounding= 0; - s->bitrate= 0; + printf("ext header missing, %d left\n", left); + } + else + { + fprintf(stderr, "I frame too long, ignoring ext header\n"); } return 0; } -static inline void memsetw(short *tab, int val, int n) +static inline void msmpeg4_memsetw(short *tab, int val, int n) { int i; for(i=0;i<n;i++) @@ -952,6 +1061,7 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code) int code, val, sign, shift; code = get_vlc(&s->gb, &v2_mv_vlc); +// printf("MV code %d at %d %d pred: %d\n", code, s->mb_x,s->mb_y, pred); if (code < 0) return 0xffff; @@ -965,8 +1075,8 @@ static int 
msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code) val++; if (sign) val = -val; - val += pred; + val += pred; if (val <= -64) val += 64; else if (val >= 64) @@ -976,7 +1086,7 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code) } -int msmpeg4v2_decode_mb(MpegEncContext *s, +static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) { int cbp, code, i; @@ -996,20 +1106,41 @@ int msmpeg4v2_decode_mb(MpegEncContext *s, } } - code = get_vlc(&s->gb, &v2_mb_type_vlc); + if(s->msmpeg4_version==2) + code = get_vlc(&s->gb, &v2_mb_type_vlc); + else + code = get_vlc(&s->gb, &v1_inter_cbpc_vlc); + if(code<0 || code>7){ + fprintf(stderr, "cbpc %d invalid at %d %d\n", code, s->mb_x, s->mb_y); + return -1; + } + s->mb_intra = code >>2; cbp = code & 0x3; } else { s->mb_intra = 1; - cbp= get_vlc(&s->gb, &v2_intra_cbpc_vlc); + if(s->msmpeg4_version==2) + cbp= get_vlc(&s->gb, &v2_intra_cbpc_vlc); + else + cbp= get_vlc(&s->gb, &v1_intra_cbpc_vlc); + if(cbp<0 || cbp>3){ + fprintf(stderr, "cbpc %d invalid at %d %d\n", cbp, s->mb_x, s->mb_y); + return -1; + } } if (!s->mb_intra) { - int mx, my; + int mx, my, cbpy; + + cbpy= get_vlc(&s->gb, &cbpy_vlc); + if(cbpy<0){ + fprintf(stderr, "cbpy %d invalid at %d %d\n", cbp, s->mb_x, s->mb_y); + return -1; + } - cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2; - if((cbp&3) != 3) cbp^= 0x3C; + cbp|= cbpy<<2; + if(s->msmpeg4_version==1 || (cbp&3) != 3) cbp^= 0x3C; h263_pred_motion(s, 0, &mx, &my); mx= msmpeg4v2_decode_motion(s, mx, 1); @@ -1020,14 +1151,20 @@ int msmpeg4v2_decode_mb(MpegEncContext *s, s->mv[0][0][0] = mx; s->mv[0][0][1] = my; } else { - s->ac_pred = get_bits1(&s->gb); - cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2; + if(s->msmpeg4_version==2){ + s->ac_pred = get_bits1(&s->gb); + cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2; //FIXME check errors + } else{ + s->ac_pred = 0; + cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2; //FIXME check errors + if(s->pict_type==P_TYPE) cbp^=0x3C; + } } for (i = 0; i < 6; i++) { if 
(msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1) < 0) { - fprintf(stderr,"\nIgnoring error while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i); + fprintf(stderr,"\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i); return -1; } } @@ -1046,23 +1183,23 @@ int msmpeg4_decode_mb(MpegEncContext *s, int wrap; /* reset DC pred (set previous line to 1024) */ wrap = 2 * s->mb_width + 2; - memsetw(&s->dc_val[0][(1) + (2 * s->mb_y) * wrap], - 1024, 2 * s->mb_width); - wrap = s->mb_width + 2; - memsetw(&s->dc_val[1][(1) + (s->mb_y) * wrap], - 1024, s->mb_width); - memsetw(&s->dc_val[2][(1) + (s->mb_y) * wrap], - 1024, s->mb_width); - - /* reset AC pred (set previous line to 0) */ - wrap = s->mb_width * 2 + 2; - memsetw(s->ac_val[0][0] + (1 + (2 * s->mb_y) * wrap)*16, - 0, 2 * s->mb_width*16); - wrap = s->mb_width + 2; - memsetw(s->ac_val[1][0] + (1 + (s->mb_y) * wrap)*16, - 0, s->mb_width*16); - memsetw(s->ac_val[2][0] + (1 + (s->mb_y) * wrap)*16, - 0, s->mb_width*16); + msmpeg4_memsetw(&s->dc_val[0][(1) + (2 * s->mb_y) * wrap], + 1024, 2 * s->mb_width); + wrap = s->mb_width + 2; + msmpeg4_memsetw(&s->dc_val[1][(1) + (s->mb_y) * wrap], + 1024, s->mb_width); + msmpeg4_memsetw(&s->dc_val[2][(1) + (s->mb_y) * wrap], + 1024, s->mb_width); + + /* reset AC pred (set previous line to 0) */ + wrap = s->mb_width * 2 + 2; + msmpeg4_memsetw(s->ac_val[0][0] + (1 + (2 * s->mb_y) * wrap)*16, + 0, 2 * s->mb_width*16); + wrap = s->mb_width + 2; + msmpeg4_memsetw(s->ac_val[1][0] + (1 + (s->mb_y) * wrap)*16, + 0, s->mb_width*16); + msmpeg4_memsetw(s->ac_val[2][0] + (1 + (s->mb_y) * wrap)*16, + 0, s->mb_width*16); s->first_slice_line = 1; } else { @@ -1070,7 +1207,7 @@ int msmpeg4_decode_mb(MpegEncContext *s, } } - if(s->msmpeg4_version==2) return msmpeg4v2_decode_mb(s, block); //FIXME merge if possible + if(s->msmpeg4_version<=2) return msmpeg4v12_decode_mb(s, block); //FIXME export function & call from outside perhaps if (s->pict_type == P_TYPE) { 
set_stat(ST_INTER_MB); @@ -1133,10 +1270,11 @@ int msmpeg4_decode_mb(MpegEncContext *s, for (i = 0; i < 6; i++) { if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1) < 0) { - fprintf(stderr,"\nIgnoring error while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i); - // return -1; + fprintf(stderr,"\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i); + return -1; } } + return 0; } @@ -1156,14 +1294,24 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block, /* DC coef */ set_stat(ST_DC); level = msmpeg4_decode_dc(s, n, &dc_pred_dir); - if (level < 0) + if (level < 0){ + fprintf(stderr, "dc overflow-\n"); return -1; - block[0] = level; + } if (n < 4) { rl = &rl_table[s->rl_table_index]; + if(level > 256*s->y_dc_scale){ + fprintf(stderr, "dc overflow+\n"); + return -1; + } } else { rl = &rl_table[3 + s->rl_chroma_table_index]; + if(level > 256*s->c_dc_scale){ + fprintf(stderr, "dc overflow+\n"); + return -1; + } } + block[0] = level; run_diff = 0; i = 1; @@ -1204,16 +1352,42 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block, return -1; if (code == rl->n) { /* escape */ - if (get_bits1(&s->gb) == 0) { - if (get_bits1(&s->gb) == 0) { + if (s->msmpeg4_version==1 || get_bits1(&s->gb) == 0) { + if (s->msmpeg4_version==1 || get_bits1(&s->gb) == 0) { /* third escape */ last = get_bits1(&s->gb); run = get_bits(&s->gb, 6); level = get_bits(&s->gb, 8); level = (level << 24) >> 24; /* sign extend */ +#if 0 // waste of time / this will detect very few errors + { + const int abs_level= ABS(level); + const int run1= run - rl->max_run[last][abs_level] - run_diff; + if(abs_level<=MAX_LEVEL && run<=MAX_RUN){ + if(abs_level <= rl->max_level[last][run]){ + fprintf(stderr, "illegal 3. esc, vlc encoding possible\n"); + return DECODING_AC_LOST; + } + if(abs_level <= rl->max_level[last][run]*2){ + fprintf(stderr, "illegal 3. 
esc, esc 1 encoding possible\n"); + return DECODING_AC_LOST; + } + if(abs_level <= rl->max_level[last][run1] && 0){ + fprintf(stderr, "illegal 3. esc, esc 2 encoding possible\n"); + return DECODING_AC_LOST; + } + } + } +#endif //level = level * qmul + (level>0) * qadd - (level<=0) * qadd ; if (level>0) level= level * qmul + qadd; - else level= level * qmul - qadd; + else level= level * qmul - qadd; +#if 0 // waste of time too :( + if(level>2048 || level<-2048){ + fprintf(stderr, "|level| overflow in 3. esc\n"); + return DECODING_AC_LOST; + } +#endif } else { /* second escape */ code = get_vlc(&s->gb, &rl->vlc); @@ -1250,6 +1424,7 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block, i += run; if (i >= 64) return -1; + j = scan_table[i]; block[j] = level; i++; @@ -1271,9 +1446,8 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block, static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr) { int level, pred; - INT16 *dc_val; - if(s->msmpeg4_version==2){ + if(s->msmpeg4_version<=2){ if (n < 4) { level = get_vlc(&s->gb, &v2_dc_lum_vlc); } else { @@ -1288,8 +1462,10 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr) } else { level = get_vlc(&s->gb, &dc_chroma_vlc[s->dc_table_index]); } - if (level < 0) + if (level < 0){ + fprintf(stderr, "illegal dc vlc\n"); return -1; + } if (level == DC_MAX) { level = get_bits(&s->gb, 8); @@ -1301,14 +1477,24 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr) } } - pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr); - level += pred; + if(s->msmpeg4_version==1){ + INT32 *dc_val; + pred = msmpeg4v1_pred_dc(s, n, &dc_val); + level += pred; + + /* update predictor */ + *dc_val= level; + }else{ + INT16 *dc_val; + pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr); + level += pred; - /* update predictor */ - if (n < 4) { - *dc_val = level * s->y_dc_scale; - } else { - *dc_val = level * s->c_dc_scale; + /* update predictor */ + if (n < 4) { + *dc_val = level 
* s->y_dc_scale; + } else { + *dc_val = level * s->c_dc_scale; + } } return level; diff --git a/src/libffmpeg/libavcodec/msmpeg4data.h b/src/libffmpeg/libavcodec/msmpeg4data.h index 9dcb8276f..66e0a3d89 100644 --- a/src/libffmpeg/libavcodec/msmpeg4data.h +++ b/src/libffmpeg/libavcodec/msmpeg4data.h @@ -3,7 +3,7 @@ */ /* intra picture macro block coded block pattern */ -const UINT16 table_mb_intra[64][2] = { +static const UINT16 table_mb_intra[64][2] = { { 0x1, 1 },{ 0x17, 6 },{ 0x9, 5 },{ 0x5, 5 }, { 0x6, 5 },{ 0x47, 9 },{ 0x20, 7 },{ 0x10, 7 }, { 0x2, 5 },{ 0x7c, 9 },{ 0x3a, 7 },{ 0x1d, 7 }, @@ -23,7 +23,7 @@ const UINT16 table_mb_intra[64][2] = { }; /* non intra picture macro block coded block pattern + mb type */ -const UINT32 table_mb_non_intra[128][2] = { +static const UINT32 table_mb_non_intra[128][2] = { { 0x40, 7 },{ 0x13c9, 13 },{ 0x9fd, 12 },{ 0x1fc, 15 }, { 0x9fc, 12 },{ 0xa83, 18 },{ 0x12d34, 17 },{ 0x83bc, 16 }, { 0x83a, 12 },{ 0x7f8, 17 },{ 0x3fd, 16 },{ 0x3ff, 16 }, @@ -128,7 +128,7 @@ static const UINT32 table0_dc_chroma[120][2] = { /* dc table 1 */ -const UINT32 table1_dc_lum[120][2] = { +static const UINT32 table1_dc_lum[120][2] = { { 0x2, 2 },{ 0x3, 2 },{ 0x3, 3 },{ 0x2, 4 }, { 0x5, 4 },{ 0x1, 5 },{ 0x3, 5 },{ 0x8, 5 }, { 0x0, 6 },{ 0x5, 6 },{ 0xd, 6 },{ 0xf, 6 }, @@ -161,7 +161,7 @@ const UINT32 table1_dc_lum[120][2] = { { 0x1e6964, 26 },{ 0x1e6965, 26 },{ 0x1e6966, 26 },{ 0x1e6967, 26 }, }; -const UINT32 table1_dc_chroma[120][2] = { +static const UINT32 table1_dc_chroma[120][2] = { { 0x0, 2 },{ 0x1, 2 },{ 0x4, 3 },{ 0x7, 3 }, { 0xb, 4 },{ 0xd, 4 },{ 0x15, 5 },{ 0x28, 6 }, { 0x30, 6 },{ 0x32, 6 },{ 0x52, 7 },{ 0x62, 7 }, @@ -233,7 +233,7 @@ static const UINT16 table0_vlc[133][2] = { { 0x16, 7 }, }; -const INT8 table0_level[132] = { +static const INT8 table0_level[132] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, @@ -253,7 +253,7 @@ const INT8 table0_level[132] = { 1, 1, 1, 1, }; -const INT8 table0_run[132] = 
{ +static const INT8 table0_run[132] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, @@ -275,7 +275,7 @@ const INT8 table0_run[132] = { /* vlc table 1, for intra chroma and P macro blocks */ -const UINT16 table1_vlc[149][2] = { +static const UINT16 table1_vlc[149][2] = { { 0x4, 3 },{ 0x14, 5 },{ 0x17, 7 },{ 0x7f, 8 }, { 0x154, 9 },{ 0x1f2, 10 },{ 0xbf, 11 },{ 0x65, 12 }, { 0xaaa, 12 },{ 0x630, 13 },{ 0x1597, 13 },{ 0x3b7, 14 }, @@ -316,7 +316,7 @@ const UINT16 table1_vlc[149][2] = { { 0xd, 9 }, }; -const INT8 table1_level[148] = { +static const INT8 table1_level[148] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, @@ -338,7 +338,7 @@ const INT8 table1_level[148] = { 1, 1, 1, 1, }; -const INT8 table1_run[148] = { +static const INT8 table1_run[148] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, @@ -362,7 +362,7 @@ const INT8 table1_run[148] = { /* third vlc table */ -const UINT16 table2_vlc[186][2] = { +static const UINT16 table2_vlc[186][2] = { { 0x1, 2 },{ 0x5, 3 },{ 0xd, 4 },{ 0x12, 5 }, { 0xe, 6 },{ 0x15, 7 },{ 0x13, 8 },{ 0x3f, 8 }, { 0x4b, 9 },{ 0x11f, 9 },{ 0xb8, 10 },{ 0x3e3, 10 }, @@ -412,7 +412,7 @@ const UINT16 table2_vlc[186][2] = { { 0x23dc, 14 },{ 0x4a, 9 }, }; -const INT8 table2_level[185] = { +static const INT8 table2_level[185] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1, 2, 3, 4, 5, @@ -439,7 +439,7 @@ const INT8 table2_level[185] = { 1, }; -const INT8 table2_run[185] = { +static const INT8 table2_run[185] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, @@ -467,7 +467,7 @@ const INT8 table2_run[185] = { }; /* second non intra vlc table */ -const UINT16 table4_vlc[169][2] = { +static const UINT16 table4_vlc[169][2] = { { 0x0, 3 },{ 0x3, 4 },{ 0xb, 5 },{ 0x14, 6 }, { 0x3f, 6 },{ 0x5d, 7 },{ 0xa2, 8 },{ 0xac, 9 }, { 0x16e, 9 },{ 0x20a, 10 },{ 0x2e2, 10 },{ 0x432, 11 }, @@ -513,7 +513,7 @@ const UINT16 
table4_vlc[169][2] = { { 0x169, 9 }, }; -const INT8 table4_level[168] = { +static const INT8 table4_level[168] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 1, @@ -537,7 +537,7 @@ const INT8 table4_level[168] = { 1, 1, 1, 1, 1, 1, 1, 1, }; -const INT8 table4_run[168] = { +static const INT8 table4_run[168] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, @@ -575,6 +575,11 @@ extern const UINT8 DCtab_chrom[13][2]; extern const UINT8 cbpy_tab[16][2]; extern const UINT8 mvtab[33][2]; +extern const UINT8 intra_MCBPC_code[8]; +extern const UINT8 intra_MCBPC_bits[8]; + +extern const UINT8 inter_MCBPC_code[8]; +extern const UINT8 inter_MCBPC_bits[8]; #define NB_RL_TABLES 6 @@ -627,7 +632,7 @@ static RLTable rl_table[NB_RL_TABLES] = { /* motion vector table 0 */ -const UINT16 table0_mv_code[1100] = { +static const UINT16 table0_mv_code[1100] = { 0x0001, 0x0003, 0x0005, 0x0007, 0x0003, 0x0008, 0x000c, 0x0001, 0x0002, 0x001b, 0x0006, 0x000b, 0x0015, 0x0002, 0x000e, 0x000f, 0x0014, 0x0020, 0x0022, 0x0025, 0x0027, 0x0029, 0x002d, 0x004b, @@ -768,7 +773,7 @@ const UINT16 table0_mv_code[1100] = { 0x5f0d, 0x5f0e, 0x5f0f, 0x0000, }; -const UINT8 table0_mv_bits[1100] = { +static const UINT8 table0_mv_bits[1100] = { 1, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, @@ -909,7 +914,7 @@ const UINT8 table0_mv_bits[1100] = { 17, 17, 17, 8, }; -const UINT8 table0_mvx[1099] = { +static const UINT8 table0_mvx[1099] = { 32, 32, 31, 32, 33, 31, 33, 31, 33, 32, 34, 32, 30, 32, 31, 34, 35, 32, 34, 33, 29, 33, 30, 30, @@ -1050,7 +1055,7 @@ const UINT8 table0_mvx[1099] = { 61, 19, 19, }; -const UINT8 table0_mvy[1099] = { +static const UINT8 table0_mvy[1099] = { 32, 31, 32, 33, 32, 31, 31, 33, 33, 34, 32, 30, 32, 35, 34, 31, 32, 29, 33, 30, 32, 34, 33, 31, @@ -1192,7 +1197,7 @@ const UINT8 table0_mvy[1099] = { }; /* motion vector table 1 */ -const UINT16 table1_mv_code[1100] = { +static const UINT16 
table1_mv_code[1100] = { 0x0000, 0x0007, 0x0009, 0x000f, 0x000a, 0x0011, 0x001a, 0x001c, 0x0011, 0x0031, 0x0025, 0x002d, 0x002f, 0x006f, 0x0075, 0x0041, 0x004c, 0x004e, 0x005c, 0x0060, 0x0062, 0x0066, 0x0068, 0x0069, @@ -1333,7 +1338,7 @@ const UINT16 table1_mv_code[1100] = { 0x2473, 0x26a2, 0x26a3, 0x000b, }; -const UINT8 table1_mv_bits[1100] = { +static const UINT8 table1_mv_bits[1100] = { 2, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, @@ -1474,7 +1479,7 @@ const UINT8 table1_mv_bits[1100] = { 15, 15, 15, 4, }; -const UINT8 table1_mvx[1099] = { +static const UINT8 table1_mvx[1099] = { 32, 31, 32, 31, 33, 32, 33, 33, 31, 34, 30, 32, 32, 34, 35, 32, 34, 33, 29, 30, 30, 32, 31, 31, @@ -1615,7 +1620,7 @@ const UINT8 table1_mvx[1099] = { 0, 12, 27, }; -const UINT8 table1_mvy[1099] = { +static const UINT8 table1_mvy[1099] = { 32, 32, 31, 31, 32, 33, 31, 33, 33, 32, 32, 30, 34, 31, 32, 29, 33, 30, 32, 33, 31, 35, 34, 30, diff --git a/src/libffmpeg/libavcodec/ratecontrol.c b/src/libffmpeg/libavcodec/ratecontrol.c new file mode 100644 index 000000000..8395eefad --- /dev/null +++ b/src/libffmpeg/libavcodec/ratecontrol.c @@ -0,0 +1,402 @@ +/* + * Rate control for video encoders + * + * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "avcodec.h" +#include "dsputil.h" +#include "mpegvideo.h" + +#define STATS_FILE "lavc_stats.txt" + +static int init_pass2(MpegEncContext *s); + +void ff_write_pass1_stats(MpegEncContext *s){ + RateControlContext *rcc= &s->rc_context; +// fprintf(c->stats_file, "type:%d q:%d icount:%d pcount:%d scount:%d itex:%d ptex%d mv:%d misc:%d fcode:%d bcode:%d\") + fprintf(rcc->stats_file, "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d\n", + s->picture_number, s->input_picture_number - s->max_b_frames, s->pict_type, + s->qscale, s->i_tex_bits, s->p_tex_bits, s->mv_bits, s->misc_bits, s->f_code, s->b_code); +} + +int ff_rate_control_init(MpegEncContext *s) +{ + RateControlContext *rcc= &s->rc_context; + emms_c(); + + if(s->flags&CODEC_FLAG_PASS1){ + rcc->stats_file= fopen(STATS_FILE, "w"); + if(!rcc->stats_file){ + fprintf(stderr, "failed to open " STATS_FILE "\n"); + return -1; + } + } else if(s->flags&CODEC_FLAG_PASS2){ + int size; + int i; + + rcc->stats_file= fopen(STATS_FILE, "r"); + if(!rcc->stats_file){ + fprintf(stderr, "failed to open " STATS_FILE "\n"); + return -1; + } + + /* find number of pics without reading the file twice :) */ + fseek(rcc->stats_file, 0, SEEK_END); + size= ftell(rcc->stats_file); + fseek(rcc->stats_file, 0, SEEK_SET); + + size/= 64; // we need at least 64 byte to store a line ... 
+ rcc->entry = (RateControlEntry*)av_mallocz(size*sizeof(RateControlEntry)); + + for(i=0; !feof(rcc->stats_file); i++){ + RateControlEntry *rce; + int picture_number; + int e; + + e= fscanf(rcc->stats_file, "in:%d ", &picture_number); + rce= &rcc->entry[picture_number]; + e+=fscanf(rcc->stats_file, "out:%*d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%*d bcode:%*d\n", + &rce->pict_type, &rce->qscale, &rce->i_tex_bits, &rce->p_tex_bits, &rce->mv_bits, &rce->misc_bits); + if(e!=7){ + fprintf(stderr, STATS_FILE " is damaged\n"); + return -1; + } + } + rcc->num_entries= i; + + if(init_pass2(s) < 0) return -1; + } + + /* no 2pass stuff, just normal 1-pass */ + //initial values, they dont really matter as they will be totally different within a few frames + s->i_pred.coeff= s->p_pred.coeff= 7.0; + s->i_pred.count= s->p_pred.count= 1.0; + + s->i_pred.decay= s->p_pred.decay= 0.4; + + // use more bits at the beginning, otherwise high motion at the begin will look like shit + s->qsum=100 * s->qmin; + s->qcount=100; + + s->short_term_qsum=0.001; + s->short_term_qcount=0.001; + + return 0; +} + +void ff_rate_control_uninit(MpegEncContext *s) +{ + RateControlContext *rcc= &s->rc_context; + emms_c(); + + if(rcc->stats_file) + fclose(rcc->stats_file); + rcc->stats_file = NULL; + av_freep(&rcc->entry); +} + +//---------------------------------- +// 1 Pass Code + +static double predict(Predictor *p, double q, double var) +{ + return p->coeff*var / (q*p->count); +} + +static void update_predictor(Predictor *p, double q, double var, double size) +{ + double new_coeff= size*q / (var + 1); + if(var<1000) return; + + p->count*= p->decay; + p->coeff*= p->decay; + p->count++; + p->coeff+= new_coeff; +} + +int ff_rate_estimate_qscale(MpegEncContext *s) +{ + int qmin= s->qmin; + int qmax= s->qmax; + int rate_q=5; + float q; + int qscale; + float br_compensation; + double diff; + double short_term_q; + double long_term_q; + double fps; + int picture_number= s->input_picture_number - 
s->max_b_frames; + int64_t wanted_bits; + emms_c(); + + fps= (double)s->frame_rate / FRAME_RATE_BASE; + wanted_bits= (uint64_t)(s->bit_rate*(double)picture_number/fps); +// printf("%d %d %d\n", picture_number, (int)wanted_bits, (int)s->total_bits); + + if(s->pict_type==B_TYPE){ + qmin= (int)(qmin*s->b_quant_factor+s->b_quant_offset + 0.5); + qmax= (int)(qmax*s->b_quant_factor+s->b_quant_offset + 0.5); + } + if(qmin<1) qmin=1; + if(qmax>31) qmax=31; + if(qmax<=qmin) qmax= qmin; + + /* update predictors */ + if(picture_number>2){ + if(s->pict_type!=B_TYPE && s->last_non_b_pict_type == P_TYPE){ +//printf("%d %d %d %f\n", s->qscale, s->last_mc_mb_var, s->frame_bits, s->p_pred.coeff); + update_predictor(&s->p_pred, s->last_non_b_qscale, s->last_non_b_mc_mb_var, s->pb_frame_bits); + } + } + + if(s->pict_type == I_TYPE){ + short_term_q= s->short_term_qsum/s->short_term_qcount; + + long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0 + + q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q); + }else if(s->pict_type==B_TYPE){ + q= (int)(s->last_non_b_qscale*s->b_quant_factor+s->b_quant_offset + 0.5); + }else{ //P Frame + int i; + int diff, best_diff=1000000000; + for(i=1; i<=31; i++){ + diff= predict(&s->p_pred, i, s->mc_mb_var_sum) - (double)s->bit_rate/fps; + if(diff<0) diff= -diff; + if(diff<best_diff){ + best_diff= diff; + rate_q= i; + } + } + s->short_term_qsum*=s->qblur; + s->short_term_qcount*=s->qblur; + + s->short_term_qsum+= rate_q; + s->short_term_qcount++; + short_term_q= s->short_term_qsum/s->short_term_qcount; + + long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0 + +// q= (long_term_q - short_term_q)*s->qcompress + short_term_q; + q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q); + } + + diff= s->total_bits - wanted_bits; + br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance; + if(br_compensation<=0.0) br_compensation=0.001; + 
q/=br_compensation; +//printf("%f %f %f\n", q, br_compensation, short_term_q); + qscale= (int)(q + 0.5); + if (qscale<qmin) qscale=qmin; + else if(qscale>qmax) qscale=qmax; + + if(s->pict_type!=B_TYPE){ + s->qsum+= qscale; + s->qcount++; + if (qscale<s->last_non_b_qscale-s->max_qdiff) qscale=s->last_non_b_qscale-s->max_qdiff; + else if(qscale>s->last_non_b_qscale+s->max_qdiff) qscale=s->last_non_b_qscale+s->max_qdiff; + } +//printf("q:%d diff:%d comp:%f rate_q:%d st_q:%f fvar:%d last_size:%d\n", qscale, (int)diff, br_compensation, +// rate_q, short_term_q, s->mc_mb_var, s->frame_bits); +//printf("%d %d\n", s->bit_rate, (int)fps); + return qscale; +} + +//---------------------------------------------- +// 2-Pass code + +static int init_pass2(MpegEncContext *s) +{ + RateControlContext *rcc= &s->rc_context; + int i; + double fps= (double)s->frame_rate / FRAME_RATE_BASE; + double complexity[5]={0,0,0,0,0}; // aproximate bits at quant=1 + double avg_quantizer[5]; + uint64_t const_bits[5]={0,0,0,0,0}; // quantizer idependant bits + uint64_t available_bits[5]; + uint64_t all_const_bits; + uint64_t all_available_bits= (uint64_t)(s->bit_rate*(double)rcc->num_entries/fps); + int num_frames[5]={0,0,0,0,0}; + double rate_factor=0; + double step; + int last_i_frame=-10000000; + + /* find complexity & const_bits & decide the pict_types */ + for(i=0; i<rcc->num_entries; i++){ + RateControlEntry *rce= &rcc->entry[i]; + + if(s->b_frame_strategy==0 || s->max_b_frames==0){ + rce->new_pict_type= rce->pict_type; + }else{ + int j; + int next_non_b_type=P_TYPE; + + switch(rce->pict_type){ + case I_TYPE: + if(i-last_i_frame>s->gop_size/2){ //FIXME this is not optimal + rce->new_pict_type= I_TYPE; + last_i_frame= i; + }else{ + rce->new_pict_type= P_TYPE; // will be caught by the scene detection anyway + } + break; + case P_TYPE: + rce->new_pict_type= P_TYPE; + break; + case B_TYPE: + for(j=i+1; j<i+s->max_b_frames+2 && j<rcc->num_entries; j++){ + if(rcc->entry[j].pict_type != B_TYPE){ + 
next_non_b_type= rcc->entry[j].pict_type; + break; + } + } + if(next_non_b_type==I_TYPE) + rce->new_pict_type= P_TYPE; + else + rce->new_pict_type= B_TYPE; + break; + } + } + + complexity[rce->new_pict_type]+= (rce->i_tex_bits+ rce->p_tex_bits)*(double)rce->qscale; + const_bits[rce->new_pict_type]+= rce->mv_bits + rce->misc_bits; + num_frames[rce->new_pict_type]++; + } + all_const_bits= const_bits[I_TYPE] + const_bits[P_TYPE] + const_bits[B_TYPE]; + + if(all_available_bits < all_const_bits){ + fprintf(stderr, "requested bitrate is to low\n"); + return -1; + } + +// avg_complexity= complexity/rcc->num_entries; + avg_quantizer[P_TYPE]= + avg_quantizer[I_TYPE]= (complexity[I_TYPE]+complexity[P_TYPE] + complexity[B_TYPE]/s->b_quant_factor) + / (all_available_bits - all_const_bits); + avg_quantizer[B_TYPE]= avg_quantizer[P_TYPE]*s->b_quant_factor + s->b_quant_offset; +//printf("avg quantizer: %f %f\n", avg_quantizer[P_TYPE], avg_quantizer[B_TYPE]); + + for(i=0; i<5; i++){ + available_bits[i]= const_bits[i] + complexity[i]/avg_quantizer[i]; + } +//printf("%lld %lld %lld %lld\n", available_bits[I_TYPE], available_bits[P_TYPE], available_bits[B_TYPE], all_available_bits); + + for(step=256*256; step>0.0000001; step*=0.5){ + uint64_t expected_bits=0; + rate_factor+= step; + /* find qscale */ + for(i=0; i<rcc->num_entries; i++){ + RateControlEntry *rce= &rcc->entry[i]; + double short_term_q, q, bits_left; + const int pict_type= rce->new_pict_type; + int qmin= s->qmin; + int qmax= s->qmax; + + if(pict_type==B_TYPE){ + qmin= (int)(qmin*s->b_quant_factor+s->b_quant_offset + 0.5); + qmax= (int)(qmax*s->b_quant_factor+s->b_quant_offset + 0.5); + } + if(qmin<1) qmin=1; + if(qmax>31) qmax=31; + if(qmax<=qmin) qmax= qmin; + + switch(s->rc_strategy){ + case 0: + bits_left= available_bits[pict_type]/num_frames[pict_type]*rate_factor - rce->misc_bits - rce->mv_bits; + if(bits_left<1.0) bits_left=1.0; + short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits)/bits_left; + break; + 
case 1: + bits_left= (available_bits[pict_type] - const_bits[pict_type])/num_frames[pict_type]*rate_factor; + if(bits_left<1.0) bits_left=1.0; + short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits)/bits_left; + break; + case 2: + bits_left= available_bits[pict_type]/num_frames[pict_type]*rate_factor; + if(bits_left<1.0) bits_left=1.0; + short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits + rce->misc_bits + rce->mv_bits)/bits_left; + break; + default: + fprintf(stderr, "unknown strategy\n"); + short_term_q=3; //gcc warning fix + } + + if(short_term_q>31.0) short_term_q=31.0; + else if (short_term_q<1.0) short_term_q=1.0; + + q= 1/((1/avg_quantizer[pict_type] - 1/short_term_q)*s->qcompress + 1/short_term_q); + if (q<qmin) q=qmin; + else if(q>qmax) q=qmax; +//printf("lq:%f, sq:%f t:%f q:%f\n", avg_quantizer[rce->pict_type], short_term_q, bits_left, q); + rce->new_qscale= q; + } + + /* smooth curve */ + + /* find expected bits */ + for(i=0; i<rcc->num_entries; i++){ + RateControlEntry *rce= &rcc->entry[i]; + double factor= rce->qscale / rce->new_qscale; + + rce->expected_bits= expected_bits; + expected_bits += (int)(rce->misc_bits + rce->mv_bits + (rce->i_tex_bits + rce->p_tex_bits)*factor + 0.5); + } + +// printf("%d %d %f\n", (int)expected_bits, (int)all_available_bits, rate_factor); + if(expected_bits > all_available_bits) rate_factor-= step; + } + + return 0; +} + +int ff_rate_estimate_qscale_pass2(MpegEncContext *s) +{ + int qmin= s->qmin; + int qmax= s->qmax; + float q; + int qscale; + float br_compensation; + double diff; + int picture_number= s->picture_number; + RateControlEntry *rce= &s->rc_context.entry[picture_number]; + int64_t wanted_bits= rce->expected_bits; + emms_c(); + +// printf("%d %d %d\n", picture_number, (int)wanted_bits, (int)s->total_bits); + + if(s->pict_type==B_TYPE){ + qmin= (int)(qmin*s->b_quant_factor+s->b_quant_offset + 0.5); + qmax= (int)(qmax*s->b_quant_factor+s->b_quant_offset + 0.5); + } + if(qmin<1) qmin=1; + 
if(qmax>31) qmax=31; + if(qmax<=qmin) qmax= qmin; + + q= rce->new_qscale; + + diff= s->total_bits - wanted_bits; + br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance; + if(br_compensation<=0.0) br_compensation=0.001; + q/=br_compensation; + + qscale= (int)(q + 0.5); + if (qscale<qmin) qscale=qmin; + else if(qscale>qmax) qscale=qmax; +// printf("%d %d %d %d type:%d\n", qmin, qscale, qmax, picture_number, s->pict_type); fflush(stdout); + return qscale; +} diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c index f4ebc9016..261c889de 100644 --- a/src/libffmpeg/libavcodec/rv10.c +++ b/src/libffmpeg/libavcodec/rv10.c @@ -1,27 +1,23 @@ /* * RV10 codec - * Copyright (c) 2000,2001 Gerard Lantau. + * Copyright (c) 2000,2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include "common.h" -#include "dsputil.h" #include "avcodec.h" +#include "dsputil.h" #include "mpegvideo.h" //#define DEBUG @@ -337,9 +333,9 @@ static int rv10_decode_picture_header(MpegEncContext *s) static int rv10_decode_init(AVCodecContext *avctx) { MpegEncContext *s = avctx->priv_data; - int i; static int done; +// s->avctx= avctx; s->out_format = FMT_H263; s->width = avctx->width; @@ -351,11 +347,6 @@ static int rv10_decode_init(AVCodecContext *avctx) if (MPV_common_init(s) < 0) return -1; - /* XXX: suppress this matrix init, only needed because using mpeg1 - dequantize in mmx case */ - for(i=0;i<64;i++) - s->non_intra_matrix[i] = default_non_intra_matrix[i]; - h263_decode_init_vlc(s); /* init rv vlc */ @@ -439,9 +430,27 @@ static int rv10_decode_frame(AVCodecContext *avctx, s->rv10_first_dc_coded[0] = 0; s->rv10_first_dc_coded[1] = 0; s->rv10_first_dc_coded[2] = 0; - + + s->block_wrap[0]= + s->block_wrap[1]= + s->block_wrap[2]= + s->block_wrap[3]= s->mb_width*2 + 2; + s->block_wrap[4]= + s->block_wrap[5]= s->mb_width + 2; + s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1 + s->mb_x*2; + s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1) + s->mb_x*2; + s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1 + s->mb_x*2; + s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2) + s->mb_x*2; + s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x; + s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x; /* decode each macroblock */ for(i=0;i<mb_count;i++) { + s->block_index[0]+=2; + s->block_index[1]+=2; + s->block_index[2]+=2; + s->block_index[3]+=2; + 
s->block_index[4]++; + s->block_index[5]++; #ifdef DEBUG printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y); #endif @@ -459,6 +468,12 @@ static int rv10_decode_frame(AVCodecContext *avctx, if (++s->mb_x == s->mb_width) { s->mb_x = 0; s->mb_y++; + s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1; + s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1); + s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1; + s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2); + s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1) + s->block_wrap[0]*(s->mb_height*2 + 2); + s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2); } } diff --git a/src/libffmpeg/libavcodec/simple_idct.c b/src/libffmpeg/libavcodec/simple_idct.c index a9653b187..0665f667a 100644 --- a/src/libffmpeg/libavcodec/simple_idct.c +++ b/src/libffmpeg/libavcodec/simple_idct.c @@ -1,29 +1,29 @@ /* - Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ - + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ /* - based upon some outcommented c code from mpeg2dec (idct_mmx.c written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) -*/ - -#include <inttypes.h> - + based upon some outcommented c code from mpeg2dec (idct_mmx.c + written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) + */ +#include "avcodec.h" +#include "dsputil.h" #include "simple_idct.h" -#include "../config.h" #if 0 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ @@ -39,7 +39,7 @@ #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 @@ -47,65 +47,33 @@ #define COL_SHIFT 20 // 6 #endif -/* 
8x8 Matrix used to do a trivial (slow) 8 point IDCT */ -static int coeff[64]={ - W4, W4, W4, W4, W4, W4, W4, W4, - W1, W3, W5, W7,-W7,-W5,-W3,-W1, - W2, W6,-W6,-W2,-W2,-W6, W6, W2, - W3,-W7,-W1,-W5, W5, W1, W7,-W3, - W4,-W4,-W4, W4, W4,-W4,-W4, W4, - W5,-W1, W7, W3,-W3,-W7, W1,-W5, - W6,-W2, W2,-W6,-W6, W2,-W2, W6, - W7,-W5, W3,-W1, W1,-W3, W5,-W7 -}; - -static int inline idctRowCondZ (int16_t * row) -{ - int a0, a1, a2, a3, b0, b1, b2, b3; +#ifdef ARCH_ALPHA +#define FAST_64BIT +#endif - if( !( ((uint32_t*)row)[0]|((uint32_t*)row)[1] |((uint32_t*)row)[2] |((uint32_t*)row)[3])) { -/* row[0] = row[1] = row[2] = row[3] = row[4] = - row[5] = row[6] = row[7] = 0;*/ - return 0; - } +#if defined(ARCH_POWERPC_405) - if(!( ((uint32_t*)row)[2] |((uint32_t*)row)[3] )){ - a0 = W4*row[0] + W2*row[2] + (1<<(ROW_SHIFT-1)); - a1 = W4*row[0] + W6*row[2] + (1<<(ROW_SHIFT-1)); - a2 = W4*row[0] - W6*row[2] + (1<<(ROW_SHIFT-1)); - a3 = W4*row[0] - W2*row[2] + (1<<(ROW_SHIFT-1)); - - b0 = W1*row[1] + W3*row[3]; - b1 = W3*row[1] - W7*row[3]; - b2 = W5*row[1] - W1*row[3]; - b3 = W7*row[1] - W5*row[3]; - }else{ - a0 = W4*row[0] + W2*row[2] + W4*row[4] + W6*row[6] + (1<<(ROW_SHIFT-1)); - a1 = W4*row[0] + W6*row[2] - W4*row[4] - W2*row[6] + (1<<(ROW_SHIFT-1)); - a2 = W4*row[0] - W6*row[2] - W4*row[4] + W2*row[6] + (1<<(ROW_SHIFT-1)); - a3 = W4*row[0] - W2*row[2] + W4*row[4] - W6*row[6] + (1<<(ROW_SHIFT-1)); - - b0 = W1*row[1] + W3*row[3] + W5*row[5] + W7*row[7]; - b1 = W3*row[1] - W7*row[3] - W1*row[5] - W5*row[7]; - b2 = W5*row[1] - W1*row[3] + W7*row[5] + W3*row[7]; - b3 = W7*row[1] - W5*row[3] + W3*row[5] - W1*row[7]; - } +/* signed 16x16 -> 32 multiply add accumulate */ +#define MAC16(rt, ra, rb) \ + asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); - row[0] = (a0 + b0) >> ROW_SHIFT; - row[1] = (a1 + b1) >> ROW_SHIFT; - row[2] = (a2 + b2) >> ROW_SHIFT; - row[3] = (a3 + b3) >> ROW_SHIFT; - row[4] = (a3 - b3) >> ROW_SHIFT; - row[5] = (a2 - b2) >> ROW_SHIFT; - row[6] = 
(a1 - b1) >> ROW_SHIFT; - row[7] = (a0 - b0) >> ROW_SHIFT; - - return 1; -} +/* signed 16x16 -> 32 multiply */ +#define MUL16(rt, ra, rb) \ + asm ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb)); + +#else + +/* signed 16x16 -> 32 multiply add accumulate */ +#define MAC16(rt, ra, rb) rt += (ra) * (rb) + +/* signed 16x16 -> 32 multiply */ +#define MUL16(rt, ra, rb) rt = (ra) * (rb) + +#endif #ifdef ARCH_ALPHA /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */ -static int inline idctRowCondDC(int16_t *row) +static inline int idctRowCondDC(int16_t *row) { int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; uint64_t *lrow = (uint64_t *) row; @@ -129,10 +97,10 @@ static int inline idctRowCondDC(int16_t *row) } } - a0 = W4 * row[0]; - a1 = W4 * row[0]; - a2 = W4 * row[0]; - a3 = W4 * row[0]; + a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); + a1 = a0; + a2 = a0; + a3 = a0; if (row[2]) { a0 += W2 * row[2]; @@ -155,11 +123,6 @@ static int inline idctRowCondDC(int16_t *row) a3 -= W6 * row[6]; } - a0 += 1 << (ROW_SHIFT - 1); - a1 += 1 << (ROW_SHIFT - 1); - a2 += 1 << (ROW_SHIFT - 1); - a3 += 1 << (ROW_SHIFT - 1); - if (row[1]) { b0 = W1 * row[1]; b1 = W3 * row[1]; @@ -205,38 +168,86 @@ static int inline idctRowCondDC(int16_t *row) return 2; } #else /* not ARCH_ALPHA */ -static int inline idctRowCondDC (int16_t * row) + +static inline void idctRowCondDC (int16_t * row) { int a0, a1, a2, a3, b0, b1, b2, b3; +#ifdef FAST_64BIT + uint64_t temp; +#else + uint32_t temp; +#endif - if( !( ((uint32_t*)row)[1] |((uint32_t*)row)[2] |((uint32_t*)row)[3]| row[1])) { -// row[0] = row[1] = row[2] = row[3] = row[4] = row[5] = row[6] = row[7] = row[0]<<3; - uint16_t temp= row[0]<<3; - ((uint32_t*)row)[0]=((uint32_t*)row)[1]= - ((uint32_t*)row)[2]=((uint32_t*)row)[3]= temp + (temp<<16); - return 0; +#ifdef FAST_64BIT +#ifdef WORDS_BIGENDIAN +#define ROW0_MASK 0xffff000000000000LL +#else +#define ROW0_MASK 0xffffLL +#endif + if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | + ((uint64_t 
*)row)[1]) == 0) { + temp = (row[0] << 3) & 0xffff; + temp += temp << 16; + temp += temp << 32; + ((uint64_t *)row)[0] = temp; + ((uint64_t *)row)[1] = temp; + return; + } +#else + if (!(((uint32_t*)row)[1] | + ((uint32_t*)row)[2] | + ((uint32_t*)row)[3] | + row[1])) { + temp = (row[0] << 3) & 0xffff; + temp += temp << 16; + ((uint32_t*)row)[0]=((uint32_t*)row)[1] = + ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp; + return; } +#endif - if(!( ((uint32_t*)row)[2] |((uint32_t*)row)[3] )){ - a0 = W4*row[0] + W2*row[2] + (1<<(ROW_SHIFT-1)); - a1 = W4*row[0] + W6*row[2] + (1<<(ROW_SHIFT-1)); - a2 = W4*row[0] - W6*row[2] + (1<<(ROW_SHIFT-1)); - a3 = W4*row[0] - W2*row[2] + (1<<(ROW_SHIFT-1)); - - b0 = W1*row[1] + W3*row[3]; - b1 = W3*row[1] - W7*row[3]; - b2 = W5*row[1] - W1*row[3]; - b3 = W7*row[1] - W5*row[3]; - }else{ - a0 = W4*row[0] + W2*row[2] + W4*row[4] + W6*row[6] + (1<<(ROW_SHIFT-1)); - a1 = W4*row[0] + W6*row[2] - W4*row[4] - W2*row[6] + (1<<(ROW_SHIFT-1)); - a2 = W4*row[0] - W6*row[2] - W4*row[4] + W2*row[6] + (1<<(ROW_SHIFT-1)); - a3 = W4*row[0] - W2*row[2] + W4*row[4] - W6*row[6] + (1<<(ROW_SHIFT-1)); - - b0 = W1*row[1] + W3*row[3] + W5*row[5] + W7*row[7]; - b1 = W3*row[1] - W7*row[3] - W1*row[5] - W5*row[7]; - b2 = W5*row[1] - W1*row[3] + W7*row[5] + W3*row[7]; - b3 = W7*row[1] - W5*row[3] + W3*row[5] - W1*row[7]; + a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); + a1 = a0; + a2 = a0; + a3 = a0; + + /* no need to optimize : gcc does it */ + a0 += W2 * row[2]; + a1 += W6 * row[2]; + a2 -= W6 * row[2]; + a3 -= W2 * row[2]; + + MUL16(b0, W1, row[1]); + MAC16(b0, W3, row[3]); + MUL16(b1, W3, row[1]); + MAC16(b1, -W7, row[3]); + MUL16(b2, W5, row[1]); + MAC16(b2, -W1, row[3]); + MUL16(b3, W7, row[1]); + MAC16(b3, -W5, row[3]); + +#ifdef FAST_64BIT + temp = ((uint64_t*)row)[1]; +#else + temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; +#endif + if (temp != 0) { + a0 += W4*row[4] + W6*row[6]; + a1 += - W4*row[4] - W2*row[6]; + a2 += - W4*row[4] + W2*row[6]; + a3 += 
W4*row[4] - W6*row[6]; + + MAC16(b0, W5, row[5]); + MAC16(b0, W7, row[7]); + + MAC16(b1, -W1, row[5]); + MAC16(b1, -W5, row[7]); + + MAC16(b2, W7, row[5]); + MAC16(b2, W3, row[7]); + + MAC16(b3, W3, row[5]); + MAC16(b3, -W1, row[7]); } row[0] = (a0 + b0) >> ROW_SHIFT; @@ -247,202 +258,151 @@ static int inline idctRowCondDC (int16_t * row) row[5] = (a2 - b2) >> ROW_SHIFT; row[3] = (a3 + b3) >> ROW_SHIFT; row[4] = (a3 - b3) >> ROW_SHIFT; - - return 1; } #endif /* not ARCH_ALPHA */ -static void inline idctCol (int16_t * col) +static inline void idctSparseColPut (UINT8 *dest, int line_size, + int16_t * col) { - -/* - if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { - col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = - col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; - return; - }*/ - int a0, a1, a2, a3, b0, b1, b2, b3; - col[0] += (1<<(COL_SHIFT-1))/W4; - a0 = W4*col[8*0] + W2*col[8*2] + W4*col[8*4] + W6*col[8*6]; - a1 = W4*col[8*0] + W6*col[8*2] - W4*col[8*4] - W2*col[8*6]; - a2 = W4*col[8*0] - W6*col[8*2] - W4*col[8*4] + W2*col[8*6]; - a3 = W4*col[8*0] - W2*col[8*2] + W4*col[8*4] - W6*col[8*6]; - - b0 = W1*col[8*1] + W3*col[8*3] + W5*col[8*5] + W7*col[8*7]; - b1 = W3*col[8*1] - W7*col[8*3] - W1*col[8*5] - W5*col[8*7]; - b2 = W5*col[8*1] - W1*col[8*3] + W7*col[8*5] + W3*col[8*7]; - b3 = W7*col[8*1] - W5*col[8*3] + W3*col[8*5] - W1*col[8*7]; - - col[8*0] = (a0 + b0) >> COL_SHIFT; - col[8*7] = (a0 - b0) >> COL_SHIFT; - col[8*1] = (a1 + b1) >> COL_SHIFT; - col[8*6] = (a1 - b1) >> COL_SHIFT; - col[8*2] = (a2 + b2) >> COL_SHIFT; - col[8*5] = (a2 - b2) >> COL_SHIFT; - col[8*3] = (a3 + b3) >> COL_SHIFT; - col[8*4] = (a3 - b3) >> COL_SHIFT; -} + UINT8 *cm = cropTbl + MAX_NEG_CROP; -static void inline idctSparseCol (int16_t * col) -{ - int a0, a1, a2, a3, b0, b1, b2, b3; - col[0] += (1<<(COL_SHIFT-1))/W4; - a0 = W4*col[8*0]; - a1 = W4*col[8*0]; - a2 = W4*col[8*0]; - a3 = W4*col[8*0]; - - if(col[8*2]){ - a0 += + W2*col[8*2]; - a1 += + 
W6*col[8*2]; - a2 += - W6*col[8*2]; - a3 += - W2*col[8*2]; - } + /* XXX: I did that only to give same values as previous code */ + a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); + a1 = a0; + a2 = a0; + a3 = a0; - if(col[8*4]){ - a0 += + W4*col[8*4]; - a1 += - W4*col[8*4]; - a2 += - W4*col[8*4]; - a3 += + W4*col[8*4]; - } - - if(col[8*6]){ - a0 += + W6*col[8*6]; - a1 += - W2*col[8*6]; - a2 += + W2*col[8*6]; - a3 += - W6*col[8*6]; - } + a0 += + W2*col[8*2]; + a1 += + W6*col[8*2]; + a2 += - W6*col[8*2]; + a3 += - W2*col[8*2]; - if(col[8*1]){ - b0 = W1*col[8*1]; - b1 = W3*col[8*1]; - b2 = W5*col[8*1]; - b3 = W7*col[8*1]; - }else{ - b0 = - b1 = - b2 = - b3 = 0; - } + MUL16(b0, W1, col[8*1]); + MUL16(b1, W3, col[8*1]); + MUL16(b2, W5, col[8*1]); + MUL16(b3, W7, col[8*1]); - if(col[8*3]){ - b0 += + W3*col[8*3]; - b1 += - W7*col[8*3]; - b2 += - W1*col[8*3]; - b3 += - W5*col[8*3]; - } + MAC16(b0, + W3, col[8*3]); + MAC16(b1, - W7, col[8*3]); + MAC16(b2, - W1, col[8*3]); + MAC16(b3, - W5, col[8*3]); - if(col[8*5]){ - b0 += + W5*col[8*5]; - b1 += - W1*col[8*5]; - b2 += + W7*col[8*5]; - b3 += + W3*col[8*5]; + if(col[8*4]){ + a0 += + W4*col[8*4]; + a1 += - W4*col[8*4]; + a2 += - W4*col[8*4]; + a3 += + W4*col[8*4]; } - if(col[8*7]){ - b0 += + W7*col[8*7]; - b1 += - W5*col[8*7]; - b2 += + W3*col[8*7]; - b3 += - W1*col[8*7]; + if (col[8*5]) { + MAC16(b0, + W5, col[8*5]); + MAC16(b1, - W1, col[8*5]); + MAC16(b2, + W7, col[8*5]); + MAC16(b3, + W3, col[8*5]); } -#ifndef ARCH_ALPHA - if(!(b0|b1|b2|b3)){ - col[8*0] = (a0) >> COL_SHIFT; - col[8*7] = (a0) >> COL_SHIFT; - col[8*1] = (a1) >> COL_SHIFT; - col[8*6] = (a1) >> COL_SHIFT; - col[8*2] = (a2) >> COL_SHIFT; - col[8*5] = (a2) >> COL_SHIFT; - col[8*3] = (a3) >> COL_SHIFT; - col[8*4] = (a3) >> COL_SHIFT; - }else{ -#endif - col[8*0] = (a0 + b0) >> COL_SHIFT; - col[8*7] = (a0 - b0) >> COL_SHIFT; - col[8*1] = (a1 + b1) >> COL_SHIFT; - col[8*6] = (a1 - b1) >> COL_SHIFT; - col[8*2] = (a2 + b2) >> COL_SHIFT; - col[8*5] = (a2 - b2) >> 
COL_SHIFT; - col[8*3] = (a3 + b3) >> COL_SHIFT; - col[8*4] = (a3 - b3) >> COL_SHIFT; -#ifndef ARCH_ALPHA - } -#endif + if(col[8*6]){ + a0 += + W6*col[8*6]; + a1 += - W2*col[8*6]; + a2 += + W2*col[8*6]; + a3 += - W6*col[8*6]; + } + + if (col[8*7]) { + MAC16(b0, + W7, col[8*7]); + MAC16(b1, - W5, col[8*7]); + MAC16(b2, + W3, col[8*7]); + MAC16(b3, - W1, col[8*7]); + } + + dest[0] = cm[(a0 + b0) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a1 + b1) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a2 + b2) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a3 + b3) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a3 - b3) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a2 - b2) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a1 - b1) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a0 - b0) >> COL_SHIFT]; } -static void inline idctSparse2Col (int16_t * col) +static inline void idctSparseColAdd (UINT8 *dest, int line_size, + int16_t * col) { int a0, a1, a2, a3, b0, b1, b2, b3; - col[0] += (1<<(COL_SHIFT-1))/W4; - a0 = W4*col[8*0]; - a1 = W4*col[8*0]; - a2 = W4*col[8*0]; - a3 = W4*col[8*0]; - - if(col[8*2]){ - a0 += + W2*col[8*2]; - a1 += + W6*col[8*2]; - a2 += - W6*col[8*2]; - a3 += - W2*col[8*2]; - } + UINT8 *cm = cropTbl + MAX_NEG_CROP; - if(col[8*4]){ - a0 += + W4*col[8*4]; - a1 += - W4*col[8*4]; - a2 += - W4*col[8*4]; - a3 += + W4*col[8*4]; - } + /* XXX: I did that only to give same values as previous code */ + a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); + a1 = a0; + a2 = a0; + a3 = a0; - if(col[8*6]){ - a0 += + W6*col[8*6]; - a1 += - W2*col[8*6]; - a2 += + W2*col[8*6]; - a3 += - W6*col[8*6]; - } + a0 += + W2*col[8*2]; + a1 += + W6*col[8*2]; + a2 += - W6*col[8*2]; + a3 += - W2*col[8*2]; - if(col[8*1] || 1){ - b0 = W1*col[8*1]; - b1 = W3*col[8*1]; - b2 = W5*col[8*1]; - b3 = W7*col[8*1]; - }else{ - b0 = - b1 = - b2 = - b3 = 0; - } + MUL16(b0, W1, col[8*1]); + MUL16(b1, W3, col[8*1]); + MUL16(b2, W5, col[8*1]); + MUL16(b3, W7, col[8*1]); - 
if(col[8*3]){ - b0 += + W3*col[8*3]; - b1 += - W7*col[8*3]; - b2 += - W1*col[8*3]; - b3 += - W5*col[8*3]; - } + MAC16(b0, + W3, col[8*3]); + MAC16(b1, - W7, col[8*3]); + MAC16(b2, - W1, col[8*3]); + MAC16(b3, - W5, col[8*3]); - if(col[8*5]){ - b0 += + W5*col[8*5]; - b1 += - W1*col[8*5]; - b2 += + W7*col[8*5]; - b3 += + W3*col[8*5]; + if(col[8*4]){ + a0 += + W4*col[8*4]; + a1 += - W4*col[8*4]; + a2 += - W4*col[8*4]; + a3 += + W4*col[8*4]; } - if(col[8*7]){ - b0 += + W7*col[8*7]; - b1 += - W5*col[8*7]; - b2 += + W3*col[8*7]; - b3 += - W1*col[8*7]; + if (col[8*5]) { + MAC16(b0, + W5, col[8*5]); + MAC16(b1, - W1, col[8*5]); + MAC16(b2, + W7, col[8*5]); + MAC16(b3, + W3, col[8*5]); } - col[8*0] = (a0 + b0) >> COL_SHIFT; - col[8*7] = (a0 - b0) >> COL_SHIFT; - col[8*1] = (a1 + b1) >> COL_SHIFT; - col[8*6] = (a1 - b1) >> COL_SHIFT; - col[8*2] = (a2 + b2) >> COL_SHIFT; - col[8*5] = (a2 - b2) >> COL_SHIFT; - col[8*3] = (a3 + b3) >> COL_SHIFT; - col[8*4] = (a3 - b3) >> COL_SHIFT; + if(col[8*6]){ + a0 += + W6*col[8*6]; + a1 += - W2*col[8*6]; + a2 += + W2*col[8*6]; + a3 += - W6*col[8*6]; + } + + if (col[8*7]) { + MAC16(b0, + W7, col[8*7]); + MAC16(b1, - W5, col[8*7]); + MAC16(b2, + W3, col[8*7]); + MAC16(b3, - W1, col[8*7]); + } + + dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)]; } #ifdef ARCH_ALPHA @@ -472,82 +432,11 @@ static inline void idctCol2(int16_t *col) lcol[12] = l; lcol[13] = r; lcol[14] = l; lcol[15] = r; } -#endif void simple_idct (short *block) { int i; - -#if 
0 - int nonZero[8]; - int buffer[64]; - int nNonZero=0; - - idctRowCondDC(block); - - for(i=1; i<8; i++) - { - nonZero[nNonZero]=i; - nNonZero+= idctRowCondZ(block + i*8); - } - - if(nNonZero==0) - { - for(i=0; i<8; i++) - { - block[i ]= - block[i+8 ]= - block[i+16]= - block[i+24]= - block[i+32]= - block[i+40]= - block[i+48]= - block[i+56]= (W4*block[i] + (1<<(COL_SHIFT-1))) >> COL_SHIFT; - } - } - else if(nNonZero==1) - { - int index= nonZero[0]*8; - for(i=0; i<8; i++) - { - int bias= W4*block[i] + (1<<(COL_SHIFT-1)); - int c= block[i + index]; - block[i ]= (c*coeff[index ] + bias) >> COL_SHIFT; - block[i+8 ]= (c*coeff[index+1] + bias) >> COL_SHIFT; - block[i+16]= (c*coeff[index+2] + bias) >> COL_SHIFT; - block[i+24]= (c*coeff[index+3] + bias) >> COL_SHIFT; - block[i+32]= (c*coeff[index+4] + bias) >> COL_SHIFT; - block[i+40]= (c*coeff[index+5] + bias) >> COL_SHIFT; - block[i+48]= (c*coeff[index+6] + bias) >> COL_SHIFT; - block[i+56]= (c*coeff[index+7] + bias) >> COL_SHIFT; - } - } -/* else if(nNonZero==2) - { - int index1= nonZero[0]*8; - int index2= nonZero[1]*8; - for(i=0; i<8; i++) - { - int bias= W4*block[i] + (1<<(COL_SHIFT-1)); - int c1= block[i + index1]; - int c2= block[i + index2]; - block[i ]= (c1*coeff[index1 ] + c2*coeff[index2 ] + bias) >> COL_SHIFT; - block[i+8 ]= (c1*coeff[index1+1] + c2*coeff[index2+1] + bias) >> COL_SHIFT; - block[i+16]= (c1*coeff[index1+2] + c2*coeff[index2+2] + bias) >> COL_SHIFT; - block[i+24]= (c1*coeff[index1+3] + c2*coeff[index2+3] + bias) >> COL_SHIFT; - block[i+32]= (c1*coeff[index1+4] + c2*coeff[index2+4] + bias) >> COL_SHIFT; - block[i+40]= (c1*coeff[index1+5] + c2*coeff[index2+5] + bias) >> COL_SHIFT; - block[i+48]= (c1*coeff[index1+6] + c2*coeff[index2+6] + bias) >> COL_SHIFT; - block[i+56]= (c1*coeff[index1+7] + c2*coeff[index2+7] + bias) >> COL_SHIFT; - } - }*/ - else - { - for(i=0; i<8; i++) - idctSparse2Col(block + i); - } -#elif defined(ARCH_ALPHA) int rowsZero = 1; /* all rows except row 0 zero */ int 
rowsConstant = 1; /* all rows consist of a constant value */ @@ -579,11 +468,43 @@ void simple_idct (short *block) for (i = 0; i < 8; i++) idctSparseCol(block + i); } +} + +/* XXX: suppress this mess */ +void simple_idct_put(UINT8 *dest, int line_size, DCTELEM *block) +{ + simple_idct(block); + put_pixels_clamped(block, dest, line_size); +} + +void simple_idct_add(UINT8 *dest, int line_size, DCTELEM *block) +{ + simple_idct(block); + add_pixels_clamped(block, dest, line_size); +} + #else - for(i=0; i<8; i++) - idctRowCondDC(block + i*8); - - for(i=0; i<8; i++) - idctSparseCol(block + i); -#endif + +void simple_idct_put(UINT8 *dest, int line_size, INT16 *block) +{ + int i; + for(i=0; i<8; i++) + idctRowCondDC(block + i*8); + + for(i=0; i<8; i++) + idctSparseColPut(dest + i, line_size, block + i); } + +void simple_idct_add(UINT8 *dest, int line_size, INT16 *block) +{ + int i; + for(i=0; i<8; i++) + idctRowCondDC(block + i*8); + + for(i=0; i<8; i++) + idctSparseColAdd(dest + i, line_size, block + i); +} + +#endif + +#undef COL_SHIFT diff --git a/src/libffmpeg/libavcodec/simple_idct.h b/src/libffmpeg/libavcodec/simple_idct.h index 54dff7396..233a7b841 100644 --- a/src/libffmpeg/libavcodec/simple_idct.h +++ b/src/libffmpeg/libavcodec/simple_idct.h @@ -1,20 +1,23 @@ /* - Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at) + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ - -void simple_idct(short *block); +void simple_idct_put(UINT8 *dest, int line_size, INT16 *block); +void simple_idct_add(UINT8 *dest, int line_size, INT16 *block); void simple_idct_mmx(short *block); diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c index 180712314..f6d967757 100644 --- a/src/libffmpeg/libavcodec/utils.c +++ b/src/libffmpeg/libavcodec/utils.c @@ -1,35 +1,30 @@ /* * utils for libavcodec - * Copyright (c) 2001 Gerard Lantau. + * Copyright (c) 2001 Fabrice Bellard. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
* - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <stdio.h> -#include <string.h> -#include <errno.h> -#include "common.h" -#include "dsputil.h" #include "avcodec.h" +#include "dsputil.h" +#include "mpegvideo.h" #ifdef HAVE_MALLOC_H #include <malloc.h> -#else -#include <stdlib.h> #endif /* memory alloc */ -void *av_mallocz(int size) +void *av_malloc(int size) { void *ptr; #if defined ( ARCH_X86 ) && defined ( HAVE_MEMALIGN ) @@ -52,6 +47,31 @@ void *av_mallocz(int size) return ptr; } +void *av_mallocz(int size) +{ + void *ptr; + ptr = av_malloc(size); + if (!ptr) + return NULL; + memset(ptr, 0, size); + return ptr; +} + +/* NOTE: ptr = NULL is explicetly allowed */ +void av_free(void *ptr) +{ + /* XXX: this test should not be needed on most libcs */ + if (ptr) + free(ptr); +} + +/* cannot call it directly because of 'void **' casting is not automatic */ +void __av_freep(void **ptr) +{ + av_free(*ptr); + *ptr = NULL; +} + /* encoder management */ AVCodec *first_avcodec; @@ -70,13 +90,16 @@ int avcodec_open(AVCodecContext *avctx, AVCodec *codec) avctx->codec = codec; avctx->frame_number = 0; - avctx->priv_data = av_mallocz(codec->priv_data_size); - if 
(!avctx->priv_data) - return -ENOMEM; + if (codec->priv_data_size > 0) { + avctx->priv_data = av_mallocz(codec->priv_data_size); + if (!avctx->priv_data) + return -ENOMEM; + } else { + avctx->priv_data = NULL; + } ret = avctx->codec->init(avctx); if (ret < 0) { - free(avctx->priv_data); - avctx->priv_data = NULL; + av_freep(&avctx->priv_data); return ret; } return 0; @@ -138,8 +161,7 @@ int avcodec_close(AVCodecContext *avctx) { if (avctx->codec->close) avctx->codec->close(avctx); - free(avctx->priv_data); - avctx->priv_data = NULL; + av_freep(&avctx->priv_data); avctx->codec = NULL; return 0; } @@ -205,6 +227,7 @@ AVCodec *avcodec_find(enum CodecID id) } const char *pix_fmt_str[] = { + "??", "yuv420p", "yuv422", "rgb24", @@ -218,6 +241,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) const char *codec_name; AVCodec *p; char buf1[32]; + char channels_str[100]; int bitrate; if (encode) @@ -259,19 +283,54 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) enc->width, enc->height, (float)enc->frame_rate / FRAME_RATE_BASE); } + snprintf(buf + strlen(buf), buf_size - strlen(buf), + ", q=%d-%d", enc->qmin, enc->qmax); + bitrate = enc->bit_rate; break; case CODEC_TYPE_AUDIO: snprintf(buf, buf_size, "Audio: %s", codec_name); + switch (enc->channels) { + case 1: + strcpy(channels_str, "mono"); + break; + case 2: + strcpy(channels_str, "stereo"); + break; + case 6: + strcpy(channels_str, "5:1"); + break; + default: + sprintf(channels_str, "%d channels", enc->channels); + break; + } if (enc->sample_rate) { snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %d Hz, %s", enc->sample_rate, - enc->channels == 2 ? 
"stereo" : "mono"); + channels_str); + } + + /* for PCM codecs, compute bitrate directly */ + switch(enc->codec_id) { + case CODEC_ID_PCM_S16LE: + case CODEC_ID_PCM_S16BE: + case CODEC_ID_PCM_U16LE: + case CODEC_ID_PCM_U16BE: + bitrate = enc->sample_rate * enc->channels * 16; + break; + case CODEC_ID_PCM_S8: + case CODEC_ID_PCM_U8: + case CODEC_ID_PCM_ALAW: + case CODEC_ID_PCM_MULAW: + bitrate = enc->sample_rate * enc->channels * 8; + break; + default: + bitrate = enc->bit_rate; + break; } - bitrate = enc->bit_rate; break; default: abort(); @@ -364,6 +423,15 @@ int avpicture_get_size(int pix_fmt, int width, int height) return size; } +unsigned avcodec_version( void ) +{ + return LIBAVCODEC_VERSION_INT; +} + +unsigned avcodec_build( void ) +{ + return LIBAVCODEC_BUILD; +} /* must be called before any other functions */ void avcodec_init(void) @@ -412,6 +480,7 @@ void avcodec_register_all(void) register_avcodec(&msmpeg4v1_decoder); register_avcodec(&msmpeg4v2_decoder); register_avcodec(&msmpeg4v3_decoder); + register_avcodec(&wmv1_decoder); register_avcodec(&mpeg_decoder); register_avcodec(&h263i_decoder); register_avcodec(&rv10_decoder); @@ -423,20 +492,28 @@ void avcodec_register_all(void) } -static int encode_init(AVCodecContext *s) +/* this should be called after seeking and before trying to decode the next frame */ +void avcodec_flush_buffers(AVCodecContext *avctx) +{ + MpegEncContext *s = avctx->priv_data; + s->num_available_buffers=0; +} + + +static int raw_encode_init(AVCodecContext *s) { return 0; } -static int decode_frame(AVCodecContext *avctx, - void *data, int *data_size, - UINT8 *buf, int buf_size) +static int raw_decode_frame(AVCodecContext *avctx, + void *data, int *data_size, + UINT8 *buf, int buf_size) { return -1; } -static int encode_frame(AVCodecContext *avctx, - unsigned char *frame, int buf_size, void *data) +static int raw_encode_frame(AVCodecContext *avctx, + unsigned char *frame, int buf_size, void *data) { return -1; } @@ -446,8 +523,8 @@ 
AVCodec rawvideo_codec = { CODEC_TYPE_VIDEO, CODEC_ID_RAWVIDEO, 0, - encode_init, - encode_frame, + raw_encode_init, + raw_encode_frame, NULL, - decode_frame, + raw_decode_frame, }; |