From 03ac29c63fd3d5019c67b3662669b1c443896f0b Mon Sep 17 00:00:00 2001 From: Mike Melanson Date: Sat, 29 Oct 2005 23:57:06 +0000 Subject: update libfaad2 to CVS snapshot 2004-09-15 CVS patchset: 7777 CVS date: 2005/10/29 23:57:06 --- src/libfaad/filtbank.c | 397 +++++-------------------------------------------- 1 file changed, 39 insertions(+), 358 deletions(-) (limited to 'src/libfaad/filtbank.c') diff --git a/src/libfaad/filtbank.c b/src/libfaad/filtbank.c index 3fc2cf456..2919b5be3 100644 --- a/src/libfaad/filtbank.c +++ b/src/libfaad/filtbank.c @@ -22,7 +22,7 @@ ** Commercial non-GPL licensing of this software is possible. ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. ** -** $Id: filtbank.c,v 1.9 2004/12/03 01:15:30 tmattern Exp $ +** $Id: filtbank.c,v 1.10 2005/10/29 23:57:06 tmmm Exp $ **/ #include "common.h" @@ -87,15 +87,6 @@ fb_info *filter_bank_init(uint16_t frame_len) } #endif -#ifdef USE_SSE - if (cpu_has_sse()) - { - fb->if_func = ifilter_bank_sse; - } else { - fb->if_func = ifilter_bank; - } -#endif - return fb; } @@ -140,30 +131,6 @@ static INLINE void imdct_long(fb_info *fb, real_t *in_data, real_t *out_data, ui #endif } -#ifdef USE_SSE -static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) -{ -#ifdef LD_DEC - mdct_info *mdct = NULL; - - switch (len) - { - case 2048: - case 1920: - mdct = fb->mdct2048; - break; - case 1024: - case 960: - mdct = fb->mdct1024; - break; - } - - faad_imdct_sse(mdct, in_data, out_data); -#else - faad_imdct_sse(fb->mdct2048, in_data, out_data); -#endif -} -#endif #ifdef LTP_DEC static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) @@ -215,6 +182,7 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, int64_t count = faad_get_ts(); #endif + /* select windows of current frame and previous frame (Sine or KBD) */ #ifdef LD_DEC if (object_type == LD) { @@ -230,11 +198,24 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, } #endif +#if 0 + for (i = 0; i < 1024; i++) + { + printf("%d\n", freq_in[i]); + } +#endif + +#if 0 + printf("%d %d\n", window_sequence, window_shape); +#endif switch (window_sequence) { case ONLY_LONG_SEQUENCE: + /* perform iMDCT */ imdct_long(fb, freq_in, transf_buf, 2*nlong); + + /* add second half output of previous frame to windowed output of current frame */ for (i = 0; i < nlong; i+=4) { time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); @@ -242,6 +223,8 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); } + + /* window the second half and save as overlap for next frame */ for (i = 0; i < nlong; i+=4) { overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); @@ -252,7 +235,10 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, break; case LONG_START_SEQUENCE: + /* perform iMDCT */ imdct_long(fb, freq_in, transf_buf, 2*nlong); + + /* add second half output of previous frame to windowed output of current frame */ for (i = 0; i < nlong; i+=4) { time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); @@ -260,6 +246,9 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); } + + /* window the second half and save as overlap for next frame */ + /* construct second half window using padding with 1's and 0's */ for (i = 0; i < nflat_ls; i++) overlap[i] = transf_buf[nlong+i]; for (i = 0; i < nshort; i++) @@ -269,6 +258,7 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, break; case EIGHT_SHORT_SEQUENCE: + /* perform iMDCT for each short block */ faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0); faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1); faad_imdct(fb->mdct256, freq_in+2*nshort, transf_buf+2*nshort*2); @@ -277,6 +267,8 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, faad_imdct(fb->mdct256, freq_in+5*nshort, transf_buf+2*nshort*5); faad_imdct(fb->mdct256, freq_in+6*nshort, transf_buf+2*nshort*6); faad_imdct(fb->mdct256, freq_in+7*nshort, transf_buf+2*nshort*7); + + /* add second half output of previous frame to windowed output of current frame */ for (i = 0; i < nflat_ls; i++) time_out[i] = overlap[i]; for(i = 0; i < nshort; i++) @@ -288,6 +280,8 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, if (i < trans) time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+nshort*4+i] + MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]); } + + /* window the second half and save as overlap for next frame */ for(i = 0; i < nshort; i++) { if (i >= trans) @@ -302,352 +296,39 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, break; case LONG_STOP_SEQUENCE: + /* perform iMDCT */ imdct_long(fb, freq_in, transf_buf, 2*nlong); + + /* add second half output of previous frame to windowed output of current frame */ + /* construct first half window using padding with 1's and 0's */ for (i = 0; i < nflat_ls; i++) time_out[i] = overlap[i]; for (i = 0; i < nshort; i++) time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]); for (i = 0; i < nflat_ls; i++) time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i]; + + /* window the second half and save as overlap for next frame */ for (i = 0; i < nlong; i++) overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); break; } -#ifdef PROFILE - count = faad_get_ts() - count; - fb->cycles += count; -#endif -} - -#ifdef USE_SSE -void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, - uint8_t window_shape_prev, real_t *freq_in, - real_t *time_out, uint8_t object_type, uint16_t frame_len) -{ - int16_t i; - ALIGN real_t transf_buf[2*1024] = {0}; - - const real_t *window_long = NULL; - const real_t *window_long_prev = NULL; - const real_t *window_short = NULL; - const real_t *window_short_prev = NULL; - - uint16_t nlong = frame_len; - uint16_t nshort = frame_len/8; - uint16_t trans = nshort/2; - - uint16_t nflat_ls = (nlong-nshort)/2; - -#ifdef PROFILE - int64_t count = faad_get_ts(); -#endif - -#ifdef LD_DEC - if (object_type == LD) +#if 0 + for (i = 0; i < 1024; i++) { - window_long = fb->ld_window[window_shape]; - window_long_prev = fb->ld_window[window_shape_prev]; - } else { -#endif - window_long = fb->long_window[window_shape]; - window_long_prev = fb->long_window[window_shape_prev]; - window_short = fb->short_window[window_shape]; - window_short_prev = fb->short_window[window_shape_prev]; -#ifdef LD_DEC + printf("%d\n", time_out[i]); + //printf("0x%.8X\n", time_out[i]); } #endif - switch (window_sequence) - { - case ONLY_LONG_SEQUENCE: - imdct_long_sse(fb, freq_in, transf_buf, 2*nlong); - for (i = 0; i < nlong; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - - m1 = _mm_load_ps(&transf_buf[i]); - m2 = _mm_load_ps(&window_long_prev[i]); - m6 = _mm_load_ps(&window_long[nlong-4-i]); - m3 = _mm_load_ps(&time_out[nlong+i]); - m5 = _mm_load_ps(&transf_buf[nlong+i]); - - m4 = _mm_mul_ps(m1, m2); - m7 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_add_ps(m4, m3); - m8 = _mm_mul_ps(m5, m7); - - _mm_store_ps(&time_out[i], m4); - _mm_store_ps(&time_out[nlong+i], m8); - } - break; - - case LONG_START_SEQUENCE: - imdct_long_sse(fb, freq_in, transf_buf, 2*nlong); - for (i = 0; i < nlong; i+=4) - { - __m128 m1 = _mm_load_ps(&transf_buf[i]); - __m128 m2 = _mm_load_ps(&window_long_prev[i]); - __m128 m3 = _mm_load_ps(&time_out[nlong+i]); - - __m128 m4 = _mm_mul_ps(m1, m2); - m4 = _mm_add_ps(m4, m3); - - _mm_store_ps(&time_out[i], m4); - } - for (i = 0; i < nflat_ls; i+=4) - { - __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]); - _mm_store_ps(&time_out[nlong+i], m1); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1 = _mm_load_ps(&transf_buf[nlong+nflat_ls+i]); - __m128 m2 = _mm_load_ps(&window_short[nshort-4-i]); - __m128 m3, m4; - - m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m3); - - _mm_store_ps(&time_out[nlong+nflat_ls+i], m4); - } - for (i = 0; i < nflat_ls; i+=4) - { - __m128 m1 = _mm_setzero_ps(); - _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1); - } - break; - - case EIGHT_SHORT_SEQUENCE: - faad_imdct_sse(fb->mdct256, &freq_in[0*nshort], &transf_buf[2*nshort*0]); - faad_imdct_sse(fb->mdct256, &freq_in[1*nshort], &transf_buf[2*nshort*1]); - faad_imdct_sse(fb->mdct256, &freq_in[2*nshort], &transf_buf[2*nshort*2]); - faad_imdct_sse(fb->mdct256, &freq_in[3*nshort], &transf_buf[2*nshort*3]); - faad_imdct_sse(fb->mdct256, &freq_in[4*nshort], &transf_buf[2*nshort*4]); - faad_imdct_sse(fb->mdct256, &freq_in[5*nshort], &transf_buf[2*nshort*5]); - faad_imdct_sse(fb->mdct256, &freq_in[6*nshort], &transf_buf[2*nshort*6]); - faad_imdct_sse(fb->mdct256, &freq_in[7*nshort], &transf_buf[2*nshort*7]); - for (i = 0; i < nflat_ls; i+=4) - { - __m128 m1 = _mm_load_ps(&time_out[nlong+i]); - _mm_store_ps(&time_out[i], m1); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1 = _mm_load_ps(&transf_buf[nshort*0+i]); - __m128 m2 = _mm_load_ps(&window_short_prev[i]); - __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]); - - __m128 m4 = _mm_mul_ps(m1, m2); - m4 = _mm_add_ps(m4, m3); - - _mm_store_ps(&time_out[nflat_ls+i], m4); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - m1 = _mm_load_ps(&transf_buf[nshort*1+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*1+i]); - m6 = _mm_load_ps(&transf_buf[nshort*2+i]); - m7 = _mm_load_ps(&window_short[i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m5); - m8 = _mm_mul_ps(m6, m7); - m4 = _mm_add_ps(m4, m3); - m4 = _mm_add_ps(m4, m8); - - _mm_store_ps(&time_out[nflat_ls+1*nshort+i], m4); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - m1 = _mm_load_ps(&transf_buf[nshort*3+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*2+i]); - m6 = _mm_load_ps(&transf_buf[nshort*4+i]); - m7 = _mm_load_ps(&window_short[i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m5); - m8 = _mm_mul_ps(m6, m7); - m4 = _mm_add_ps(m4, m3); - m4 = _mm_add_ps(m4, m8); - - _mm_store_ps(&time_out[nflat_ls+2*nshort+i], m4); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - m1 = _mm_load_ps(&transf_buf[nshort*5+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*3+i]); - m6 = _mm_load_ps(&transf_buf[nshort*6+i]); - m7 = _mm_load_ps(&window_short[i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m5); - m8 = _mm_mul_ps(m6, m7); - m4 = _mm_add_ps(m4, m3); - m4 = _mm_add_ps(m4, m8); - - _mm_store_ps(&time_out[nflat_ls+3*nshort+i], m4); - } - for(i = 0; i < trans; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - m1 = _mm_load_ps(&transf_buf[nshort*7+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*4+i]); - m6 = _mm_load_ps(&transf_buf[nshort*8+i]); - m7 = _mm_load_ps(&window_short[i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m5); - m8 = _mm_mul_ps(m6, m7); - m4 = _mm_add_ps(m4, m3); - m4 = _mm_add_ps(m4, m8); - - _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m4); - } - for (i = trans; i < nshort; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - m1 = _mm_load_ps(&transf_buf[nshort*7+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - m6 = _mm_load_ps(&transf_buf[nshort*8+i]); - m7 = _mm_load_ps(&window_short[i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m5); - m8 = _mm_mul_ps(m6, m7); - m3 = _mm_add_ps(m4, m8); - - _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m3); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - m1 = _mm_load_ps(&transf_buf[nshort*9+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - m6 = _mm_load_ps(&transf_buf[nshort*10+i]); - m7 = _mm_load_ps(&window_short[i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m5); - m8 = _mm_mul_ps(m6, m7); - m3 = _mm_add_ps(m4, m8); - - _mm_store_ps(&time_out[nflat_ls+5*nshort+i], m3); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - m1 = _mm_load_ps(&transf_buf[nshort*11+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - m6 = _mm_load_ps(&transf_buf[nshort*12+i]); - m7 = _mm_load_ps(&window_short[i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m5); - m8 = _mm_mul_ps(m6, m7); - m3 = _mm_add_ps(m4, m8); - - _mm_store_ps(&time_out[nflat_ls+6*nshort+i], m3); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8; - m1 = _mm_load_ps(&transf_buf[nshort*13+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - m6 = _mm_load_ps(&transf_buf[nshort*14+i]); - m7 = _mm_load_ps(&window_short[i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m5); - m8 = _mm_mul_ps(m6, m7); - m3 = _mm_add_ps(m4, m8); - - _mm_store_ps(&time_out[nflat_ls+7*nshort+i], m3); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1, m2, m3, m5; - m1 = _mm_load_ps(&transf_buf[nshort*15+i]); - m2 = _mm_load_ps(&window_short[nshort-4-i]); - - m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m3 = _mm_mul_ps(m1, m5); - - _mm_store_ps(&time_out[nflat_ls+8*nshort+i], m3); - } - for (i = 0; i < nflat_ls; i+=4) - { - __m128 m1 = _mm_setzero_ps(); - _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1); - } - break; - - case LONG_STOP_SEQUENCE: - imdct_long_sse(fb, freq_in, transf_buf, 2*nlong); - for (i = 0; i < nflat_ls; i+=4) - { - __m128 m1 = _mm_load_ps(&time_out[nlong+i]); - _mm_store_ps(&time_out[i], m1); - } - for (i = 0; i < nshort; i+=4) - { - __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+i]); - __m128 m2 = _mm_load_ps(&window_short_prev[i]); - __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]); - - __m128 m4 = _mm_mul_ps(m1, m2); - m4 = _mm_add_ps(m4, m3); - - _mm_store_ps(&time_out[nflat_ls+i], m4); - } - for (i = 0; i < nflat_ls; i+=4) - { - __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]); - __m128 m2 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+i]); - - __m128 m3 = _mm_add_ps(m1, m2); - - _mm_store_ps(&time_out[nflat_ls+nshort+i], m3); - } - for (i = 0; i < nlong; i+=4) - { - __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]); - __m128 m2 = _mm_load_ps(&window_long[nlong-4-i]); - __m128 m3, m4; - - m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); - - m4 = _mm_mul_ps(m1, m3); - - _mm_store_ps(&time_out[nlong+i], m4); - } - break; - } #ifdef PROFILE count = faad_get_ts() - count; fb->cycles += count; #endif } -#endif + #ifdef LTP_DEC /* only works for LTP -> no overlapping, no short blocks */ -- cgit v1.2.3