diff options
Diffstat (limited to 'contrib/ffmpeg/libavcodec/armv4l')
-rw-r--r-- | contrib/ffmpeg/libavcodec/armv4l/dsputil_arm.c | 15 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c | 12 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/armv4l/mpegvideo_arm.c | 8 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/armv4l/mpegvideo_armv5te.c | 213 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/armv4l/simple_idct_armv6.S | 448 |
5 files changed, 688 insertions, 8 deletions
diff --git a/contrib/ffmpeg/libavcodec/armv4l/dsputil_arm.c b/contrib/ffmpeg/libavcodec/armv4l/dsputil_arm.c index 9f0bfa2af..61b5fdacc 100644 --- a/contrib/ffmpeg/libavcodec/armv4l/dsputil_arm.c +++ b/contrib/ffmpeg/libavcodec/armv4l/dsputil_arm.c @@ -35,6 +35,12 @@ extern void simple_idct_put_armv5te(uint8_t *dest, int line_size, extern void simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data); +extern void ff_simple_idct_armv6(DCTELEM *data); +extern void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, + DCTELEM *data); +extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, + DCTELEM *data); + /* XXX: local hack */ static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); @@ -206,6 +212,8 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) if(idct_algo == FF_IDCT_AUTO){ #if defined(HAVE_IPP) idct_algo = FF_IDCT_IPP; +#elif defined(HAVE_ARMV6) + idct_algo = FF_IDCT_SIMPLEARMV6; #elif defined(HAVE_ARMV5TE) idct_algo = FF_IDCT_SIMPLEARMV5TE; #else @@ -223,6 +231,13 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) c->idct_add= simple_idct_ARM_add; c->idct = simple_idct_ARM; c->idct_permutation_type= FF_NO_IDCT_PERM; +#ifdef HAVE_ARMV6 + } else if (idct_algo==FF_IDCT_SIMPLEARMV6){ + c->idct_put= ff_simple_idct_put_armv6; + c->idct_add= ff_simple_idct_add_armv6; + c->idct = ff_simple_idct_armv6; + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; +#endif #ifdef HAVE_ARMV5TE } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){ c->idct_put= simple_idct_put_armv5te; diff --git a/contrib/ffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c b/contrib/ffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c index d7401e760..7536100ee 100644 --- a/contrib/ffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c +++ b/contrib/ffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c @@ -128,17 +128,13 @@ static void nop(uint8_t *block, const uint8_t 
*pixels, int line_size, int h) return; } -int mm_flags; /* multimedia extension flags */ - -int mm_support(void) -{ - return 0; /* TODO, implement proper detection */ -} +/* A run time test is not simple. If this file is compiled in + * then we should install the functions + */ +int mm_flags = MM_IWMMXT; /* multimedia extension flags */ void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) { - mm_flags = mm_support(); - if (avctx->dsp_mask) { if (avctx->dsp_mask & FF_MM_FORCE) mm_flags |= (avctx->dsp_mask & 0xffff); diff --git a/contrib/ffmpeg/libavcodec/armv4l/mpegvideo_arm.c b/contrib/ffmpeg/libavcodec/armv4l/mpegvideo_arm.c index 10a005cd3..22d40d8bc 100644 --- a/contrib/ffmpeg/libavcodec/armv4l/mpegvideo_arm.c +++ b/contrib/ffmpeg/libavcodec/armv4l/mpegvideo_arm.c @@ -24,9 +24,17 @@ #include "../avcodec.h" extern void MPV_common_init_iwmmxt(MpegEncContext *s); +extern void MPV_common_init_armv5te(MpegEncContext *s); void MPV_common_init_armv4l(MpegEncContext *s) { + /* IWMMXT support is a superset of armv5te, so + * allow optimised functions for armv5te unless + * a better iwmmxt function exists + */ +#ifdef HAVE_ARMV5TE + MPV_common_init_armv5te(s); +#endif #ifdef HAVE_IWMMXT MPV_common_init_iwmmxt(s); #endif diff --git a/contrib/ffmpeg/libavcodec/armv4l/mpegvideo_armv5te.c b/contrib/ffmpeg/libavcodec/armv4l/mpegvideo_armv5te.c new file mode 100644 index 000000000..5e83c8a43 --- /dev/null +++ b/contrib/ffmpeg/libavcodec/armv4l/mpegvideo_armv5te.c @@ -0,0 +1,213 @@ +/* + * Optimization of some functions from mpegvideo.c for armv5te + * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * Some useful links for those who may be interested in optimizing code for ARM. + * ARM Architecture Reference Manual: http://www.arm.com/community/academy/resources.html + * Instructions timings and optimization guide for ARM9E: http://www.arm.com/pdfs/DDI0222B_9EJS_r1p2.pdf + */ + +#include "../dsputil.h" +#include "../mpegvideo.h" +#include "../avcodec.h" + + +#ifdef ENABLE_ARM_TESTS +/** + * h263 dequantizer supplementary function, it is performance critical and needs to + * have optimized implementations for each architecture. Is also used as a reference + * implementation in regression tests + */ +static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count) +{ + int i, level; + for (i = 0; i < count; i++) { + level = block[i]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[i] = level; + } + } +} +#endif + +/* GCC 3.1 or higher is required to support symbolic names in assembly code */ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + +/** + * Special optimized version of dct_unquantize_h263_helper_c, it requires the block + * to be at least 8 bytes aligned, and may process more elements than requested. + * But it is guaranteed to never process more than 64 elements provided that + * xxcount argument is <= 64, so it is safe. 
This macro is optimized for a common + * distribution of values for nCoeffs (they are mostly multiple of 8 plus one or + * two extra elements). So this macro processes data as 8 elements per loop iteration + * and contains optional 2 elements processing in the end. + * + * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770) + */ +#define dct_unquantize_h263_special_helper_armv5te(xxblock, xxqmul, xxqadd, xxcount) \ +({ DCTELEM *xblock = xxblock; \ + int xqmul = xxqmul, xqadd = xxqadd, xcount = xxcount, xtmp; \ + int xdata1, xdata2; \ +__asm__ __volatile__( \ + "subs %[count], %[count], #2 \n\t" \ + "ble 2f \n\t" \ + "ldrd r4, [%[block], #0] \n\t" \ + "1: \n\t" \ + "ldrd r6, [%[block], #8] \n\t" \ +\ + "rsbs %[data1], %[zero], r4, asr #16 \n\t" \ + "addgt %[data1], %[qadd], #0 \n\t" \ + "rsblt %[data1], %[qadd], #0 \n\t" \ + "smlatbne %[data1], r4, %[qmul], %[data1] \n\t" \ +\ + "rsbs %[data2], %[zero], r5, asr #16 \n\t" \ + "addgt %[data2], %[qadd], #0 \n\t" \ + "rsblt %[data2], %[qadd], #0 \n\t" \ + "smlatbne %[data2], r5, %[qmul], %[data2] \n\t" \ +\ + "rsbs %[tmp], %[zero], r4, asl #16 \n\t" \ + "addgt %[tmp], %[qadd], #0 \n\t" \ + "rsblt %[tmp], %[qadd], #0 \n\t" \ + "smlabbne r4, r4, %[qmul], %[tmp] \n\t" \ +\ + "rsbs %[tmp], %[zero], r5, asl #16 \n\t" \ + "addgt %[tmp], %[qadd], #0 \n\t" \ + "rsblt %[tmp], %[qadd], #0 \n\t" \ + "smlabbne r5, r5, %[qmul], %[tmp] \n\t" \ +\ + "strh r4, [%[block]], #2 \n\t" \ + "strh %[data1], [%[block]], #2 \n\t" \ + "strh r5, [%[block]], #2 \n\t" \ + "strh %[data2], [%[block]], #2 \n\t" \ +\ + "rsbs %[data1], %[zero], r6, asr #16 \n\t" \ + "addgt %[data1], %[qadd], #0 \n\t" \ + "rsblt %[data1], %[qadd], #0 \n\t" \ + "smlatbne %[data1], r6, %[qmul], %[data1] \n\t" \ +\ + "rsbs %[data2], %[zero], r7, asr #16 \n\t" \ + "addgt %[data2], %[qadd], #0 \n\t" \ + "rsblt %[data2], %[qadd], #0 \n\t" \ + "smlatbne %[data2], r7, %[qmul], %[data2] \n\t" \ +\ + "rsbs %[tmp], %[zero], r6, asl #16 \n\t" \ + "addgt %[tmp], 
%[qadd], #0 \n\t" \ + "rsblt %[tmp], %[qadd], #0 \n\t" \ + "smlabbne r6, r6, %[qmul], %[tmp] \n\t" \ +\ + "rsbs %[tmp], %[zero], r7, asl #16 \n\t" \ + "addgt %[tmp], %[qadd], #0 \n\t" \ + "rsblt %[tmp], %[qadd], #0 \n\t" \ + "smlabbne r7, r7, %[qmul], %[tmp] \n\t" \ +\ + "strh r6, [%[block]], #2 \n\t" \ + "strh %[data1], [%[block]], #2 \n\t" \ + "strh r7, [%[block]], #2 \n\t" \ + "strh %[data2], [%[block]], #2 \n\t" \ +\ + "subs %[count], %[count], #8 \n\t" \ + "ldrgtd r4, [%[block], #0] \n\t" /* load data early to avoid load/use pipeline stall */ \ + "bgt 1b \n\t" \ +\ + "adds %[count], %[count], #2 \n\t" \ + "ble 3f \n\t" \ + "2: \n\t" \ + "ldrsh %[data1], [%[block], #0] \n\t" \ + "ldrsh %[data2], [%[block], #2] \n\t" \ + "mov %[tmp], %[qadd] \n\t" \ + "cmp %[data1], #0 \n\t" \ + "rsblt %[tmp], %[qadd], #0 \n\t" \ + "smlabbne %[data1], %[data1], %[qmul], %[tmp] \n\t" \ + "mov %[tmp], %[qadd] \n\t" \ + "cmp %[data2], #0 \n\t" \ + "rsblt %[tmp], %[qadd], #0 \n\t" \ + "smlabbne %[data2], %[data2], %[qmul], %[tmp] \n\t" \ + "strh %[data1], [%[block]], #2 \n\t" \ + "strh %[data2], [%[block]], #2 \n\t" \ + "3: \n\t" \ + : [block] "+&r" (xblock), [count] "+&r" (xcount), [tmp] "=&r" (xtmp), \ + [data1] "=&r" (xdata1), [data2] "=&r" (xdata2) \ + : [qmul] "r" (xqmul), [qadd] "r" (xqadd), [zero] "r" (0) \ + : "r4", "r5", "r6", "r7", "cc", "memory" \ +); \ +}) + +static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int i, level, qmul, qadd; + int nCoeffs; + + assert(s->block_last_index[n]>=0); + + qmul = qscale << 1; + + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level = block[0]; + } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + dct_unquantize_h263_special_helper_armv5te(block, qmul, qadd, nCoeffs + 1); + block[0] = level; +} + +static void 
dct_unquantize_h263_inter_armv5te(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int i, level, qmul, qadd; + int nCoeffs; + + assert(s->block_last_index[n]>=0); + + qadd = (qscale - 1) | 1; + qmul = qscale << 1; + + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + dct_unquantize_h263_special_helper_armv5te(block, qmul, qadd, nCoeffs + 1); +} + +#define HAVE_DCT_UNQUANTIZE_H263_ARMV5TE_OPTIMIZED + +#endif + +void MPV_common_init_armv5te(MpegEncContext *s) +{ +#ifdef HAVE_DCT_UNQUANTIZE_H263_ARMV5TE_OPTIMIZED + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; +#endif +} diff --git a/contrib/ffmpeg/libavcodec/armv4l/simple_idct_armv6.S b/contrib/ffmpeg/libavcodec/armv4l/simple_idct_armv6.S new file mode 100644 index 000000000..401e1910d --- /dev/null +++ b/contrib/ffmpeg/libavcodec/armv4l/simple_idct_armv6.S @@ -0,0 +1,448 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2007 Mans Rullgard <mru@inprovide.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5; NOTE(review): that formula gives 16384 for i=4, so 16383 is presumably deliberate -- confirm */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 /* shift argument used for the row pass */ +#define COL_SHIFT 20 /* shift argument used for the column passes */ + +#define W13 (W1 | (W3 << 16)) /* W1 in bits 0-15, W3 in bits 16-31 */ +#define W26 (W2 | (W6 << 16)) /* W2 low, W6 high */ +#define W42 (W4 | (W2 << 16)) /* W4 low, W2 high */ +#define W42n (-W4&0xffff | (-W2 << 16)) /* -W4 low (masked to 16 bits), -W2 high */ +#define W46 (W4 | (W6 << 16)) /* W4 low, W6 high */ +#define W57 (W5 | (W7 << 16)) /* W5 low, W7 high */ + + .text + .align +w13: .long W13 /* packed coefficient pairs emitted as 32-bit data words */ +w26: .long W26 +w42: .long W42 +w42n: .long W42n +w46: .long W46 +w57: .long W57 + +/* + Compute partial IDCT of single row. 
+ shift = left-shift amount + a1 = source address + a3 = row[2,0] <= 2 cycles + a4 = row[3,1] + ip = w42 <= 2 cycles + + Output in registers v1--v8 +*/ + .macro idct_row shift + ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ + mov a2, #(1<<(\shift-1)) /* a2 = 1 << (shift-1), seed for the accumulators below */ + smlad v1, a3, ip, a2 /* v1 = A0 = W4*row[0] + W2*row[2] + a2 */ + smlsd v4, a3, ip, a2 /* v4 = A3 = W4*row[0] - W2*row[2] + a2 */ + ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ + ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ + smlad v2, a3, lr, a2 /* v2 = A1 = W4*row[0] + W6*row[2] + a2 */ + smlsd v3, a3, lr, a2 /* v3 = A2 = W4*row[0] - W6*row[2] + a2 */ + + smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ + smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ + ldr lr, [a1, #12] /* lr = row[7,5] */ + pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ + pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ + smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ + smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */ + smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ + + ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */ + smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */ + ldr a3, [a1, #4] /* a3 = row[6,4] */ + smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */ + ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */ + smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] */ + + smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */ + smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */ + smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ + smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ + .endm + +/* + Compute partial IDCT of half row. 
+ shift = left-shift amount + a3 = row[2,0] + a4 = row[3,1] + ip = w42 + + Output in registers v1--v8 +*/ + .macro idct_row4 shift + ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ + ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ + mov a2, #(1<<(\shift-1)) /* a2 = 1 << (shift-1), seed for the accumulators below */ + smlad v1, a3, ip, a2 /* v1 = A0 = W4*row[0] + W2*row[2] + a2 */ + smlsd v4, a3, ip, a2 /* v4 = A3 = W4*row[0] - W2*row[2] + a2 */ + ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ + smlad v2, a3, lr, a2 /* v2 = A1 = W4*row[0] + W6*row[2] + a2 */ + smlsd v3, a3, lr, a2 /* v3 = A2 = W4*row[0] - W6*row[2] + a2 */ + smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ + smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ + pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ + pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ + smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ + smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ + .endm + +/* + Compute final part of IDCT single row without shift. + Input in registers v1--v8 + Output in registers ip, v1--v3, lr, v5--v7 +*/ + .macro idct_finish + add ip, v1, v5 /* ip = A0 + B0 */ + sub lr, v1, v5 /* lr = A0 - B0 */ + sub v1, v2, v6 /* v1 = A1 + B1 (v6 holds -B1) */ + add v5, v2, v6 /* v5 = A1 - B1 */ + add v2, v3, v7 /* v2 = A2 + B2 */ + sub v6, v3, v7 /* v6 = A2 - B2 */ + add v3, v4, fp /* v3 = A3 + B3 */ + sub v7, v4, fp /* v7 = A3 - B3 */ + .endm + +/* + Compute final part of IDCT single row. 
+ shift = right-shift amount + Input/output in registers v1--v8 +*/ + .macro idct_finish_shift shift + add a4, v1, v5 /* a4 = A0 + B0 */ + sub a3, v1, v5 /* a3 = A0 - B0 */ + mov v1, a4, asr #\shift /* v1 = (A0 + B0) >> shift */ + mov v5, a3, asr #\shift /* v5 = (A0 - B0) >> shift */ + + sub a4, v2, v6 /* a4 = A1 + B1 */ + add a3, v2, v6 /* a3 = A1 - B1 */ + mov v2, a4, asr #\shift /* v2 = (A1 + B1) >> shift */ + mov v6, a3, asr #\shift /* v6 = (A1 - B1) >> shift */ + + add a4, v3, v7 /* a4 = A2 + B2 */ + sub a3, v3, v7 /* a3 = A2 - B2 */ + mov v3, a4, asr #\shift /* v3 = (A2 + B2) >> shift */ + mov v7, a3, asr #\shift /* v7 = (A2 - B2) >> shift */ + + add a4, v4, fp /* a4 = A3 + B3 */ + sub a3, v4, fp /* a3 = A3 - B3 */ + mov v4, a4, asr #\shift /* v4 = (A3 + B3) >> shift */ + mov fp, a3, asr #\shift /* fp = (A3 - B3) >> shift */ + .endm + +/* + Compute final part of IDCT single row, saturating results at 8 bits. 
+ shift = right-shift amount + Input/output in registers v1--v8 +*/ + .macro idct_finish_shift_sat shift + add a4, v1, v5 /* a4 = A0 + B0 */ + sub ip, v1, v5 /* ip = A0 - B0 */ + usat v1, #8, a4, asr #\shift /* v1 = (A0 + B0) >> shift, clamped to 0..255 */ + usat v5, #8, ip, asr #\shift /* v5 = (A0 - B0) >> shift, clamped to 0..255 */ + + sub a4, v2, v6 /* a4 = A1 + B1 */ + add ip, v2, v6 /* ip = A1 - B1 */ + usat v2, #8, a4, asr #\shift /* v2 = (A1 + B1) >> shift, clamped to 0..255 */ + usat v6, #8, ip, asr #\shift /* v6 = (A1 - B1) >> shift, clamped to 0..255 */ + + add a4, v3, v7 /* a4 = A2 + B2 */ + sub ip, v3, v7 /* ip = A2 - B2 */ + usat v3, #8, a4, asr #\shift /* v3 = (A2 + B2) >> shift, clamped to 0..255 */ + usat v7, #8, ip, asr #\shift /* v7 = (A2 - B2) >> shift, clamped to 0..255 */ + + add a4, v4, fp /* a4 = A3 + B3 */ + sub ip, v4, fp /* ip = A3 - B3 */ + usat v4, #8, a4, asr #\shift /* v4 = (A3 + B3) >> shift, clamped to 0..255 */ + usat fp, #8, ip, asr #\shift /* fp = (A3 - B3) >> shift, clamped to 0..255 */ + .endm + +/* + Compute IDCT of single row, storing as column. 
+ a1 = source + a2 = dest +*/ + .align + .func idct_row_armv6 +idct_row_armv6: + str lr, [sp, #-4]! /* push return address */ + + ldr lr, [a1, #12] /* lr = row[7,5] */ + ldr ip, [a1, #4] /* ip = row[6,4] */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + ldr a3, [a1] /* a3 = row[2,0] */ + orrs lr, lr, ip /* lr = row[7,5] | row[6,4], sets Z when all four are zero */ + cmpeq lr, a4 /* if so, also require row[3,1] == 0 */ + cmpeq lr, a3, lsr #16 /* and row[2] == 0 */ + beq 1f /* everything except row[0] is zero: shortcut at 1: */ + str a2, [sp, #-4]! /* save dest; reloaded at 3: */ + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + cmp lr, #0 /* are row[4..7] all zero? */ + beq 2f /* yes: the half-row macro is enough */ + + idct_row ROW_SHIFT + b 3f + +2: idct_row4 ROW_SHIFT + +3: ldr a2, [sp], #4 /* restore dest */ + idct_finish_shift ROW_SHIFT + + strh v1, [a2] /* results go to byte offsets 0, 32, 64, 96, then 16, 48, 80, 112 */ + strh v2, [a2, #(16*2)] + strh v3, [a2, #(16*4)] + strh v4, [a2, #(16*6)] + strh fp, [a2, #(16*1)] + strh v7, [a2, #(16*3)] + strh v6, [a2, #(16*5)] + strh v5, [a2, #(16*7)] + + ldr pc, [sp], #4 /* pop return address into pc */ + +1: mov a3, a3, lsl #3 /* low halfword = row[0] << 3, written to all eight outputs */ + strh a3, [a2] + strh a3, [a2, #(16*2)] + strh a3, [a2, #(16*4)] + strh a3, [a2, #(16*6)] + strh a3, [a2, #(16*1)] + strh a3, [a2, #(16*3)] + strh a3, [a2, #(16*5)] + strh a3, [a2, #(16*7)] + ldr pc, [sp], #4 /* pop return address into pc */ + .endfunc + +/* + Compute IDCT of single column, read as row. 
+ a1 = source + a2 = dest +*/ + .align + .func idct_col_armv6 +idct_col_armv6: + stmfd sp!, {a2, lr} /* save dest and return address */ + + ldr a3, [a1] /* a3 = row[2,0] */ + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + idct_row COL_SHIFT + ldr a2, [sp], #4 /* restore dest */ + idct_finish_shift COL_SHIFT + + strh v1, [a2] /* eight results stored in order, 16 bytes apart */ + strh v2, [a2, #(16*1)] + strh v3, [a2, #(16*2)] + strh v4, [a2, #(16*3)] + strh fp, [a2, #(16*4)] + strh v7, [a2, #(16*5)] + strh v6, [a2, #(16*6)] + strh v5, [a2, #(16*7)] + + ldr pc, [sp], #4 /* pop return address into pc */ + .endfunc + +/* + Compute IDCT of single column, read as row, store saturated 8-bit. + a1 = source + a2 = dest + a3 = line size +*/ + .align + .func idct_col_put_armv6 +idct_col_put_armv6: + stmfd sp!, {a2, a3, lr} /* save dest, line size and return address */ + + ldr a3, [a1] /* a3 = row[2,0] */ + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + idct_row COL_SHIFT + ldmfd sp!, {a2, a3} /* restore dest and line size */ + idct_finish_shift_sat COL_SHIFT + + strb v1, [a2], a3 /* store one byte, then advance dest by the line size */ + strb v2, [a2], a3 + strb v3, [a2], a3 + strb v4, [a2], a3 + strb fp, [a2], a3 + strb v7, [a2], a3 + strb v6, [a2], a3 + strb v5, [a2], a3 + + sub a2, a2, a3, lsl #3 /* rewind dest by the 8 lines just written */ + + ldr pc, [sp], #4 /* pop return address into pc */ + .endfunc + +/* + Compute IDCT of single column, read as row, add/store saturated 8-bit. 
+ a1 = source + a2 = dest + a3 = line size +*/ + .align + .func idct_col_add_armv6 +idct_col_add_armv6: + stmfd sp!, {a2, a3, lr} /* save dest, line size and return address */ + + ldr a3, [a1] /* a3 = row[2,0] */ + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + idct_row COL_SHIFT + ldmfd sp!, {a2, a3} /* restore dest and line size */ + idct_finish + + ldrb a4, [a2] /* a4 = dest line 0 */ + ldrb v4, [a2, a3] /* v4 = dest line 1 */ + ldrb fp, [a2, a3, lsl #2] /* fp = dest line 4; overwritten by the next fp load before it is read */ + add ip, a4, ip, asr #COL_SHIFT /* line 0: pixel + (result >> COL_SHIFT) */ + usat ip, #8, ip /* clamp to 0..255 */ + add v1, v4, v1, asr #COL_SHIFT /* line 1 sum */ + strb ip, [a2], a3 /* write line 0; a2 now points at line 1 */ + ldrb ip, [a2, a3] /* ip = dest line 2 */ + usat v1, #8, v1 + ldrb fp, [a2, a3, lsl #2] /* fp = dest line 5 */ + add v2, ip, v2, asr #COL_SHIFT /* line 2 sum */ + usat v2, #8, v2 + strb v1, [a2], a3 /* write line 1; a2 now points at line 2 */ + ldrb a4, [a2, a3] /* a4 = dest line 3 */ + ldrb ip, [a2, a3, lsl #2] /* ip = dest line 6 */ + strb v2, [a2], a3 /* write line 2; a2 now points at line 3 */ + ldrb v4, [a2, a3] /* v4 = dest line 4 */ + ldrb v1, [a2, a3, lsl #2] /* v1 = dest line 7 */ + add v3, a4, v3, asr #COL_SHIFT /* line 3 sum */ + usat v3, #8, v3 + add v7, v4, v7, asr #COL_SHIFT /* line 4 sum */ + usat v7, #8, v7 + add v6, fp, v6, asr #COL_SHIFT /* line 5 sum */ + usat v6, #8, v6 + add v5, ip, v5, asr #COL_SHIFT /* line 6 sum */ + usat v5, #8, v5 + add lr, v1, lr, asr #COL_SHIFT /* line 7 sum */ + usat lr, #8, lr + strb v3, [a2], a3 /* write lines 3..7 */ + strb v7, [a2], a3 + strb v6, [a2], a3 + strb v5, [a2], a3 + strb lr, [a2], a3 + + sub a2, a2, a3, lsl #3 /* rewind dest by the 8 lines just written */ + + ldr pc, [sp], #4 /* pop return address into pc */ + .endfunc + +/* + Compute 8 IDCT row transforms. 
+ func = IDCT row->col function + width = width of columns in bytes +*/ + .macro idct_rows func width + bl \func /* source offset 0 */ + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func /* source offset 32 */ + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func /* source offset 64 */ + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func /* source offset 96 */ + sub a1, a1, #(16*5) /* 96 - 80 = 16: continue with the rows skipped so far */ + add a2, a2, #\width + bl \func /* source offset 16 */ + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func /* source offset 48 */ + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func /* source offset 80 */ + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func /* source offset 112 */ + + sub a1, a1, #(16*7) /* put a1 back to its value on entry; a2 is left advanced */ + .endm + + .align + .global ff_simple_idct_armv6 + .func ff_simple_idct_armv6 +/* void ff_simple_idct_armv6(DCTELEM *data); */ +ff_simple_idct_armv6: + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} + sub sp, sp, #128 /* reserve a 128-byte temporary block on the stack */ + + mov a2, sp /* first pass: data -> temporary */ + idct_rows idct_row_armv6, 2 + mov a2, a1 /* second pass writes back into data */ + mov a1, sp /* and reads the temporary */ + idct_rows idct_col_armv6, 2 + + add sp, sp, #128 /* release the temporary */ + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ + .endfunc + + .align + .global ff_simple_idct_add_armv6 + .func ff_simple_idct_add_armv6 +/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ +ff_simple_idct_add_armv6: + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 (dest) and a2 (line_size) are saved too */ + sub sp, sp, #128 /* reserve a 128-byte temporary block on the stack */ + + mov a1, a3 /* a1 = data */ + mov a2, sp /* first pass: data -> temporary */ + idct_rows idct_row_armv6, 2 + mov a1, sp /* second pass reads the temporary */ + ldr a2, [sp, #128] /* a2 = saved dest */ + ldr a3, [sp, #(128+4)] /* a3 = saved line_size */ + idct_rows idct_col_add_armv6, 1 + + add sp, sp, #(128+8) /* drop the temporary and the saved a1, a2 */ + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ + .endfunc + + .align + .global ff_simple_idct_put_armv6 + .func ff_simple_idct_put_armv6 +/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ +ff_simple_idct_put_armv6: + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 (dest) and a2 (line_size) are saved too */ + sub sp, sp, #128 /* reserve a 128-byte temporary block on the stack */ + + mov a1, a3 /* a1 = data */ + mov a2, sp /* first pass: data -> temporary */ + idct_rows idct_row_armv6, 2 + mov a1, sp /* second pass reads the temporary */ + ldr a2, [sp, #128] /* a2 = saved dest */ + ldr a3, [sp, #(128+4)] /* a3 = saved line_size */ + idct_rows idct_col_put_armv6, 1 + + add sp, sp, #(128+8) /* drop the temporary and the saved a1, a2 */ + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ + .endfunc |