Diffstat (limited to 'src/libfaad/cfft.c')
-rw-r--r--   src/libfaad/cfft.c | 1116
1 file changed, 895 insertions, 221 deletions
diff --git a/src/libfaad/cfft.c b/src/libfaad/cfft.c index a18527dbc..bbdfc36df 100644 --- a/src/libfaad/cfft.c +++ b/src/libfaad/cfft.c @@ -1,22 +1,28 @@ /* -** FAAD - Freeware Advanced Audio Decoder -** Copyright (C) 2002 M. Bakker -** +** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding +** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com +** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. -** +** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. -** +** ** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software +** along with this program; if not, write to the Free Software ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ** -** $Id: cfft.c,v 1.6 2003/04/12 14:58:46 miguelfreitas Exp $ +** Any non-GPL usage of this software or parts of this software is strictly +** forbidden. +** +** Commercial non-GPL licensing of this software is possible. +** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. +** +** $Id: cfft.c,v 1.7 2003/12/30 02:00:10 miguelfreitas Exp $ **/ /* @@ -37,27 +43,91 @@ #include "cfft_tab.h" -/* static declarations moved to avoid compiler warnings [MF] */ -static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, - complex_t *wa, int8_t isign); -static void passf3(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, - complex_t *wa1, complex_t *wa2, int8_t isign); -static void passf4(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, - complex_t *wa1, complex_t *wa2, complex_t *wa3, int8_t isign); -static void passf5(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, - complex_t *wa1, complex_t *wa2, complex_t *wa3, complex_t *wa4, - int8_t isign); -INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch, - uint16_t *ifac, complex_t *wa, int8_t isign); -static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac); - - /*---------------------------------------------------------------------- passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd. 
----------------------------------------------------------------------*/ -static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, - complex_t *wa, int8_t isign) +#ifdef USE_SSE +static void passf2pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa) +{ + uint16_t i, k, ah, ac; + + if (ido == 1) + { + for (k = 0; k < l1; k++) + { + ah = 2*k; + ac = 4*k; + + RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]); + IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]); + + RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]); + IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]); + } + } else { + for (k = 0; k < l1; k++) + { + ah = k*ido; + ac = 2*k*ido; + + for (i = 0; i < ido; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14; + __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24; + __m128 w1, w2, w3, w4; + + m1 = _mm_load_ps(&RE(cc[ac+i])); + m2 = _mm_load_ps(&RE(cc[ac+ido+i])); + m5 = _mm_load_ps(&RE(cc[ac+i+2])); + m6 = _mm_load_ps(&RE(cc[ac+ido+i+2])); + w1 = _mm_load_ps(&RE(wa[i])); + w3 = _mm_load_ps(&RE(wa[i+2])); + + m3 = _mm_add_ps(m1, m2); + m15 = _mm_add_ps(m5, m6); + + m4 = _mm_sub_ps(m1, m2); + m16 = _mm_sub_ps(m5, m6); + + _mm_store_ps(&RE(ch[ah+i]), m3); + _mm_store_ps(&RE(ch[ah+i+2]), m15); + + w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); + w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); + + m7 = _mm_mul_ps(m4, w1); + m17 = _mm_mul_ps(m16, w3); + m8 = _mm_mul_ps(m4, w2); + m18 = _mm_mul_ps(m16, w4); + + m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); + m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0)); + m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); + m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1)); + + m11 = _mm_add_ps(m9, m10); + m21 = _mm_add_ps(m19, m20); + m12 = _mm_sub_ps(m9, m10); + m22 = _mm_sub_ps(m19, m20); + + m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); + m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2)); + + m14 = _mm_unpacklo_ps(m12, m13); + m24 = _mm_unpacklo_ps(m22, m23); + + _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); + _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24); + } + } + } +} +#endif + +static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa) { uint16_t i, k, ah, ac; @@ -68,9 +138,9 @@ static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, ah = 2*k; ac = 4*k; - RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]); - IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]); + RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]); RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]); + IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]); IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]); } } else { @@ -83,251 +153,685 @@ static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, { complex_t t2; - RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+ido]); - IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+ido]); + RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]); + RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]); - RE(t2) = RE(cc[ac]) - RE(cc[ac+ido]); - IM(t2) = IM(cc[ac]) - IM(cc[ac+ido]); + IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]); + IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]); - RE(ch[ah+l1*ido]) = MUL_R_C(RE(t2),RE(wa[i])) - MUL_R_C(IM(t2),IM(wa[i]))*isign; - IM(ch[ah+l1*ido]) = MUL_R_C(IM(t2),RE(wa[i])) + MUL_R_C(RE(t2),IM(wa[i]))*isign; - ah++; - ac++; + ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]), + IM(t2), RE(t2), RE(wa[i]), IM(wa[i])); } } } } +static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, 
const complex_t *wa) +{ + uint16_t i, k, ah, ac; + + if (ido == 1) + { + for (k = 0; k < l1; k++) + { + ah = 2*k; + ac = 4*k; -static void passf3(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, - complex_t *wa1, complex_t *wa2, int8_t isign) + RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]); + RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]); + IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]); + IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]); + } + } else { + for (k = 0; k < l1; k++) + { + ah = k*ido; + ac = 2*k*ido; + + for (i = 0; i < ido; i++) + { + complex_t t2; + + RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]); + RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]); + + IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]); + IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]); + + ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]), + RE(t2), IM(t2), RE(wa[i]), IM(wa[i])); + } + } + } +} + + +static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa1, const complex_t *wa2, + const int8_t isign) { - static real_t taur = COEF_CONST(-0.5); - static real_t taui = COEF_CONST(0.866025403784439); + static real_t taur = FRAC_CONST(-0.5); + static real_t taui = FRAC_CONST(0.866025403784439); uint16_t i, k, ac, ah; complex_t c2, c3, d2, d3, t2; if (ido == 1) { - for (k = 0; k < l1; k++) + if (isign == 1) { - ac = 3*k+1; - ah = k; + for (k = 0; k < l1; k++) + { + ac = 3*k+1; + ah = k; - RE(t2) = RE(cc[ac]) + RE(cc[ac+1]); - IM(t2) = IM(cc[ac]) + IM(cc[ac+1]); - RE(c2) = RE(cc[ac-1]) + MUL_R_C(RE(t2),taur); - IM(c2) = IM(cc[ac-1]) + MUL_R_C(IM(t2),taur); + RE(t2) = RE(cc[ac]) + RE(cc[ac+1]); + IM(t2) = IM(cc[ac]) + IM(cc[ac+1]); + RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur); + IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur); - RE(ch[ah]) = RE(cc[ac-1]) + RE(t2); - IM(ch[ah]) = IM(cc[ac-1]) + IM(t2); + RE(ch[ah]) = RE(cc[ac-1]) + RE(t2); + IM(ch[ah]) = IM(cc[ac-1]) + IM(t2); - RE(c3) = MUL_R_C((RE(cc[ac]) - RE(cc[ac+1])), taui)*isign; - IM(c3) = MUL_R_C((IM(cc[ac]) - IM(cc[ac+1])), taui)*isign; + RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui); + IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui); - RE(ch[ah+l1]) = RE(c2) - IM(c3); - IM(ch[ah+l1]) = IM(c2) + RE(c3); - RE(ch[ah+2*l1]) = RE(c2) + IM(c3); - IM(ch[ah+2*l1]) = IM(c2) - RE(c3); + RE(ch[ah+l1]) = RE(c2) - IM(c3); + IM(ch[ah+l1]) = IM(c2) + RE(c3); + RE(ch[ah+2*l1]) = RE(c2) + IM(c3); + IM(ch[ah+2*l1]) = IM(c2) - RE(c3); + } + } else { + for (k = 0; k < l1; k++) + { + ac = 3*k+1; + ah = k; + + RE(t2) = RE(cc[ac]) + RE(cc[ac+1]); + IM(t2) = IM(cc[ac]) + IM(cc[ac+1]); + RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur); + IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur); + + RE(ch[ah]) = RE(cc[ac-1]) + RE(t2); + IM(ch[ah]) = IM(cc[ac-1]) + IM(t2); + + RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui); + IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui); + + RE(ch[ah+l1]) = RE(c2) + IM(c3); + IM(ch[ah+l1]) = IM(c2) - RE(c3); + RE(ch[ah+2*l1]) = RE(c2) - IM(c3); + IM(ch[ah+2*l1]) = IM(c2) + RE(c3); + } } } else { - for (k = 0; k < l1; k++) + if (isign == 1) { - for (i = 0; i < ido; i++) + for (k = 0; k < l1; k++) + { + for (i = 0; i < ido; i++) + { + ac = i + (3*k+1)*ido; + ah = i + k * ido; + + RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]); + RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur); + IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]); + IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur); + + RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2); + IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2); + + RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui); + IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), 
taui); + + RE(d2) = RE(c2) - IM(c3); + IM(d3) = IM(c2) - RE(c3); + RE(d3) = RE(c2) + IM(c3); + IM(d2) = IM(c2) + RE(c3); + + ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]), + IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]), + IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i])); + } + } + } else { + for (k = 0; k < l1; k++) { - ac = i + (3*k+1)*ido; - ah = i + k * ido; + for (i = 0; i < ido; i++) + { + ac = i + (3*k+1)*ido; + ah = i + k * ido; + + RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]); + RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur); + IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]); + IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur); + + RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2); + IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2); + + RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui); + IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui); + + RE(d2) = RE(c2) + IM(c3); + IM(d3) = IM(c2) + RE(c3); + RE(d3) = RE(c2) - IM(c3); + IM(d2) = IM(c2) - RE(c3); + + ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]), + RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]), + RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i])); + } + } + } + } +} - RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]); - RE(c2) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),taur); - IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]); - IM(c2) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),taur); +#ifdef USE_SSE +static void passf4pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa1, const complex_t *wa2, + const complex_t *wa3) +{ + uint16_t i, k, ac, ah; - RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2); - IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2); + if (ido == 1) + { + for (k = 0; k < l1; k+=2) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10; + __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, n10; + __m128 neg1 = _mm_set_ps(-1.0, 1.0, 1.0, 1.0); - RE(c3) = MUL_R_C((RE(cc[ac]) - RE(cc[ac+ido])), taui)*isign; - IM(c3) = MUL_R_C((IM(cc[ac]) - IM(cc[ac+ido])), taui)*isign; + m1 = _mm_load_ps(&RE(cc[4*k])); + m2 = _mm_load_ps(&RE(cc[4*k+2])); + n1 = _mm_load_ps(&RE(cc[4*k+4])); + n2 = _mm_load_ps(&RE(cc[4*k+6])); - RE(d2) = RE(c2) - IM(c3); - IM(d3) = IM(c2) - RE(c3); - RE(d3) = RE(c2) + IM(c3); - IM(d2) = IM(c2) + RE(c3); + m3 = _mm_add_ps(m1, m2); - RE(ch[ah+l1*ido]) = MUL_R_C(RE(d2),RE(wa1[i])) - MUL_R_C(IM(d2),IM(wa1[i]))*isign; - IM(ch[ah+l1*ido]) = MUL_R_C(IM(d2),RE(wa1[i])) + MUL_R_C(RE(d2),IM(wa1[i]))*isign; - RE(ch[ah+l1*2*ido]) = MUL_R_C(RE(d3),RE(wa2[i])) - MUL_R_C(IM(d3),IM(wa2[i]))*isign; - IM(ch[ah+l1*2*ido]) = MUL_R_C(IM(d3),RE(wa2[i])) + MUL_R_C(RE(d3),IM(wa2[i]))*isign; + n4 = _mm_mul_ps(neg1, n1); + n5 = _mm_mul_ps(neg1, n2); + m4 = _mm_mul_ps(neg1, m1); + m5 = _mm_mul_ps(neg1, m2); + + n3 = _mm_add_ps(n1, n2); + m6 = _mm_sub_ps(m4, m5); + + m7 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(1, 0, 1, 0)); + n6 = _mm_sub_ps(n4, n5); + m8 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(3, 2, 3, 2)); + + n7 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(1, 0, 1, 0)); + m9 = _mm_add_ps(m7, m8); + n8 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(2, 3, 2, 3)); + + m10 = _mm_sub_ps(m7, m8); + n9 = _mm_add_ps(n7, n8); + + _mm_store_ps(&RE(ch[k]), m9); + n10 = _mm_sub_ps(n7, n8); + _mm_store_ps(&RE(ch[k+l1]), n9); + _mm_store_ps(&RE(ch[k+2*l1]), m10); + _mm_store_ps(&RE(ch[k+3*l1]), n10); + } + } else { + for (k = 0; k < l1; k++) + { + ac = 4*k*ido; + ah = k*ido; + + for (i = 0; i < ido; i+=2) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16; + __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, 
m19, m20, m21, m22, m23; + __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30; + __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0); + + m1 = _mm_load_ps(&RE(cc[ac+i])); + m2 = _mm_load_ps(&RE(cc[ac+i+2*ido])); + m3 = _mm_add_ps(m1, m2); + m4 = _mm_sub_ps(m1, m2); + + n1 = _mm_load_ps(&RE(cc[ac+i+ido])); + n2 = _mm_load_ps(&RE(cc[ac+i+3*ido])); + n3 = _mm_add_ps(n1, n2); + + n4 = _mm_mul_ps(neg1, n1); + n5 = _mm_mul_ps(neg1, n2); + n6 = _mm_sub_ps(n4, n5); + + m5 = _mm_add_ps(m3, n3); + + n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1)); + n8 = _mm_add_ps(m4, n7); + + m6 = _mm_sub_ps(m3, n3); + n9 = _mm_sub_ps(m4, n7); + + _mm_store_ps(&RE(ch[ah+i]), m5); + +#if 0 + static INLINE void ComplexMult(real_t *y1, real_t *y2, + real_t x1, real_t x2, real_t c1, real_t c2) + { + *y1 = MUL_F(x1, c1) + MUL_F(x2, c2); + *y2 = MUL_F(x2, c1) - MUL_F(x1, c2); + } + + m7.0 = RE(c2)*RE(wa1[i]) + m7.1 = IM(c2)*IM(wa1[i]) + m7.2 = RE(c6)*RE(wa1[i+1]) + m7.3 = IM(c6)*IM(wa1[i+1]) + + m8.0 = RE(c2)*IM(wa1[i]) + m8.1 = IM(c2)*RE(wa1[i]) + m8.2 = RE(c6)*IM(wa1[i+1]) + m8.3 = IM(c6)*RE(wa1[i+1]) + + RE(0) = m7.0 - m7.1 + IM(0) = m8.0 + m8.1 + RE(1) = m7.2 - m7.3 + IM(1) = m8.2 + m8.3 + +//// + RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i]) + IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i]) + RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1]) + IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1]) +#endif + + w1 = _mm_load_ps(&RE(wa1[i])); + w3 = _mm_load_ps(&RE(wa2[i])); + w5 = _mm_load_ps(&RE(wa3[i])); + + w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); + w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); + w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1)); + + m7 = _mm_mul_ps(n8, w1); + m15 = _mm_mul_ps(m6, w3); + m23 = _mm_mul_ps(n9, w5); + m8 = _mm_mul_ps(n8, w2); + m16 = _mm_mul_ps(m6, w4); + m24 = _mm_mul_ps(n9, w6); + + m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); + m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0)); + m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0)); + m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); + m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1)); + m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1)); + + m11 = _mm_add_ps(m9, m10); + m19 = _mm_add_ps(m17, m18); + m27 = _mm_add_ps(m25, m26); + m12 = _mm_sub_ps(m9, m10); + m20 = _mm_sub_ps(m17, m18); + m28 = _mm_sub_ps(m25, m26); + + m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); + m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2)); + m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2)); + m14 = _mm_unpacklo_ps(m12, m13); + m22 = _mm_unpacklo_ps(m20, m21); + m30 = _mm_unpacklo_ps(m28, m29); + + _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); + _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22); + _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30); } } } } +#endif - -static void passf4(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, - complex_t *wa1, complex_t *wa2, complex_t *wa3, int8_t isign) +static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa1, const complex_t *wa2, + const complex_t *wa3) { uint16_t i, k, ac, ah; - complex_t c2, c3, c4, t1, t2, t3, t4; if (ido == 1) { for (k = 0; k < l1; k++) { + complex_t t1, t2, t3, t4; + ac = 4*k; ah = k; - RE(t2) = RE(cc[ac]) + RE(cc[ac+2]); - IM(t2) = IM(cc[ac]) + IM(cc[ac+2]); + RE(t2) = RE(cc[ac]) + RE(cc[ac+2]); + RE(t1) = RE(cc[ac]) - RE(cc[ac+2]); + IM(t2) = IM(cc[ac]) + IM(cc[ac+2]); + IM(t1) = IM(cc[ac]) - IM(cc[ac+2]); RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]); - 
IM(t3) = IM(cc[ac+1]) + IM(cc[ac+3]); - RE(t1) = RE(cc[ac]) - RE(cc[ac+2]); - IM(t1) = IM(cc[ac]) - IM(cc[ac+2]); - RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]); IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]); + IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]); + RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]); - RE(ch[ah]) = RE(t2) + RE(t3); - IM(ch[ah]) = IM(t2) + IM(t3); - RE(ch[ah+l1]) = RE(t1) + RE(t4)*isign; - IM(ch[ah+l1]) = IM(t1) + IM(t4)*isign; + RE(ch[ah]) = RE(t2) + RE(t3); RE(ch[ah+2*l1]) = RE(t2) - RE(t3); + + IM(ch[ah]) = IM(t2) + IM(t3); IM(ch[ah+2*l1]) = IM(t2) - IM(t3); - RE(ch[ah+3*l1]) = RE(t1) - RE(t4)*isign; - IM(ch[ah+3*l1]) = IM(t1) - IM(t4)*isign; + + RE(ch[ah+l1]) = RE(t1) + RE(t4); + RE(ch[ah+3*l1]) = RE(t1) - RE(t4); + + IM(ch[ah+l1]) = IM(t1) + IM(t4); + IM(ch[ah+3*l1]) = IM(t1) - IM(t4); } } else { for (k = 0; k < l1; k++) { + ac = 4*k*ido; + ah = k*ido; + for (i = 0; i < ido; i++) { - ac = i + 4*k*ido; - ah = i + k*ido; - - RE(t2) = RE(cc[ac]) + RE(cc[ac+2*ido]); - IM(t2) = IM(cc[ac]) + IM(cc[ac+2*ido]); - RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+3*ido]); - IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+3*ido]); - RE(t1) = RE(cc[ac]) - RE(cc[ac+2*ido]); - IM(t1) = IM(cc[ac]) - IM(cc[ac+2*ido]); - RE(t4) = IM(cc[ac+3*ido]) - IM(cc[ac+ido]); - IM(t4) = RE(cc[ac+ido]) - RE(cc[ac+3*ido]); - - RE(ch[ah]) = RE(t2) + RE(t3); - IM(ch[ah]) = IM(t2) + IM(t3); - - RE(c2) = RE(t1) + RE(t4)*isign; - IM(c2) = IM(t1) + IM(t4)*isign; - RE(c3) = RE(t2) - RE(t3); - IM(c3) = IM(t2) - IM(t3); - RE(c4) = RE(t1) - RE(t4)*isign; - IM(c4) = IM(t1) - IM(t4)*isign; - - RE(ch[ah+l1*ido]) = MUL_R_C(RE(c2),RE(wa1[i])) - MUL_R_C(IM(c2),IM(wa1[i]))*isign; - IM(ch[ah+l1*ido]) = MUL_R_C(IM(c2),RE(wa1[i])) + MUL_R_C(RE(c2),IM(wa1[i]))*isign; - RE(ch[ah+2*l1*ido]) = MUL_R_C(RE(c3),RE(wa2[i])) - MUL_R_C(IM(c3),IM(wa2[i]))*isign; - IM(ch[ah+2*l1*ido]) = MUL_R_C(IM(c3),RE(wa2[i])) + MUL_R_C(RE(c3),IM(wa2[i]))*isign; - RE(ch[ah+3*l1*ido]) = MUL_R_C(RE(c4),RE(wa3[i])) - MUL_R_C(IM(c4),IM(wa3[i]))*isign; - IM(ch[ah+3*l1*ido]) = MUL_R_C(IM(c4),RE(wa3[i])) + MUL_R_C(RE(c4),IM(wa3[i]))*isign; + complex_t c2, c3, c4, t1, t2, t3, t4; + + RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]); + RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]); + IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]); + IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]); + RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]); + IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]); + IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]); + RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]); + + RE(c2) = RE(t1) + RE(t4); + RE(c4) = RE(t1) - RE(t4); + + IM(c2) = IM(t1) + IM(t4); + IM(c4) = IM(t1) - IM(t4); + + RE(ch[ah+i]) = RE(t2) + RE(t3); + RE(c3) = RE(t2) - RE(t3); + + IM(ch[ah+i]) = IM(t2) + IM(t3); + IM(c3) = IM(t2) - IM(t3); + + ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]), + IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]), + IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i])); + ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]), + IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i])); } } } } - -static void passf5(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, - complex_t *wa1, complex_t *wa2, complex_t *wa3, complex_t *wa4, - int8_t isign) +static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa1, const complex_t *wa2, + const complex_t *wa3) { - static real_t tr11 = COEF_CONST(0.309016994374947); - static real_t ti11 = COEF_CONST(0.951056516295154); - static real_t tr12 = COEF_CONST(-0.809016994374947); - static 
real_t ti12 = COEF_CONST(0.587785252292473); uint16_t i, k, ac, ah; - complex_t c2, c3, c4, c5, d3, d4, d5, d2, t2, t3, t4, t5; if (ido == 1) { for (k = 0; k < l1; k++) { - ac = 5*k + 1; + complex_t t1, t2, t3, t4; + + ac = 4*k; ah = k; - RE(t2) = RE(cc[ac]) + RE(cc[ac+3]); - IM(t2) = IM(cc[ac]) + IM(cc[ac+3]); - RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]); - IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]); - RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]); - IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]); - RE(t5) = RE(cc[ac]) - RE(cc[ac+3]); - IM(t5) = IM(cc[ac]) - IM(cc[ac+3]); - - RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3); - IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3); - - RE(c2) = RE(cc[ac-1]) + MUL_R_C(RE(t2),tr11) + MUL_R_C(RE(t3),tr12); - IM(c2) = IM(cc[ac-1]) + MUL_R_C(IM(t2),tr11) + MUL_R_C(IM(t3),tr12); - RE(c3) = RE(cc[ac-1]) + MUL_R_C(RE(t2),tr12) + MUL_R_C(RE(t3),tr11); - IM(c3) = IM(cc[ac-1]) + MUL_R_C(IM(t2),tr12) + MUL_R_C(IM(t3),tr11); - RE(c4) = (MUL_R_C(RE(t5),ti12)*isign - MUL_R_C(RE(t4),ti11)); - IM(c4) = (MUL_R_C(IM(t5),ti12)*isign - MUL_R_C(IM(t4),ti11)); - RE(c5) = (MUL_R_C(RE(t5),ti11)*isign + MUL_R_C(RE(t4),ti12)); - IM(c5) = (MUL_R_C(IM(t5),ti11)*isign + MUL_R_C(IM(t4),ti12)); - - RE(ch[ah+l1]) = RE(c2) - IM(c5); - IM(ch[ah+l1]) = IM(c2) + RE(c5); - RE(ch[ah+2*l1]) = RE(c3) - IM(c4); - IM(ch[ah+2*l1]) = IM(c3) + RE(c4); - RE(ch[ah+3*l1]) = RE(c3) + IM(c4); - IM(ch[ah+3*l1]) = IM(c3) - RE(c4); - RE(ch[ah+4*l1]) = RE(c2) + IM(c5); - IM(ch[ah+4*l1]) = IM(c2) - RE(c5); + RE(t2) = RE(cc[ac]) + RE(cc[ac+2]); + RE(t1) = RE(cc[ac]) - RE(cc[ac+2]); + IM(t2) = IM(cc[ac]) + IM(cc[ac+2]); + IM(t1) = IM(cc[ac]) - IM(cc[ac+2]); + RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]); + IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]); + IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]); + RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]); + + RE(ch[ah]) = RE(t2) + RE(t3); + RE(ch[ah+2*l1]) = RE(t2) - RE(t3); + + IM(ch[ah]) = IM(t2) + IM(t3); + IM(ch[ah+2*l1]) = IM(t2) - IM(t3); + + RE(ch[ah+l1]) = RE(t1) - RE(t4); + RE(ch[ah+3*l1]) = RE(t1) + RE(t4); + + IM(ch[ah+l1]) = IM(t1) - IM(t4); + IM(ch[ah+3*l1]) = IM(t1) + IM(t4); } } else { for (k = 0; k < l1; k++) { + ac = 4*k*ido; + ah = k*ido; + for (i = 0; i < ido; i++) { - ac = i + (k*5 + 1) * ido; - ah = i + k * ido; - - RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]); - IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]); - RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]); - IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]); - RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]); - IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]); - RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]); - IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]); - - RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3); - IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3); - - RE(c2) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),tr11) + MUL_R_C(RE(t3),tr12); - IM(c2) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),tr11) + MUL_R_C(IM(t3),tr12); - RE(c3) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),tr12) + MUL_R_C(RE(t3),tr11); - IM(c3) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),tr12) + MUL_R_C(IM(t3),tr11); - RE(c4) = (MUL_R_C(RE(t5),ti12)*isign - MUL_R_C(RE(t4),ti11)); - IM(c4) = (MUL_R_C(IM(t5),ti12)*isign - MUL_R_C(IM(t4),ti11)); - RE(c5) = (MUL_R_C(RE(t5),ti11)*isign + MUL_R_C(RE(t4),ti12)); - IM(c5) = (MUL_R_C(IM(t5),ti11)*isign + MUL_R_C(IM(t4),ti12)); - - IM(d2) = IM(c2) + RE(c5); - IM(d3) = IM(c3) + RE(c4); - RE(d4) = RE(c3) + IM(c4); - RE(d5) = RE(c2) + IM(c5); - RE(d2) = RE(c2) - IM(c5); - IM(d5) = IM(c2) - RE(c5); - RE(d3) = RE(c3) - IM(c4); - IM(d4) = IM(c3) - RE(c4); - - RE(ch[ah+l1*ido]) = MUL_R_C(RE(d2),RE(wa1[i])) - MUL_R_C(IM(d2),IM(wa1[i]))*isign; - 
IM(ch[ah+l1*ido]) = MUL_R_C(IM(d2),RE(wa1[i])) + MUL_R_C(RE(d2),IM(wa1[i]))*isign; - RE(ch[ah+2*l1*ido]) = MUL_R_C(RE(d3),RE(wa2[i])) - MUL_R_C(IM(d3),IM(wa2[i]))*isign; - IM(ch[ah+2*l1*ido]) = MUL_R_C(IM(d3),RE(wa2[i])) + MUL_R_C(RE(d3),IM(wa2[i]))*isign; - RE(ch[ah+3*l1*ido]) = MUL_R_C(RE(d4),RE(wa3[i])) - MUL_R_C(IM(d4),IM(wa3[i]))*isign; - IM(ch[ah+3*l1*ido]) = MUL_R_C(IM(d4),RE(wa3[i])) + MUL_R_C(RE(d4),IM(wa3[i]))*isign; - RE(ch[ah+4*l1*ido]) = MUL_R_C(RE(d5),RE(wa4[i])) - MUL_R_C(IM(d5),IM(wa4[i]))*isign; - IM(ch[ah+4*l1*ido]) = MUL_R_C(IM(d5),RE(wa4[i])) + MUL_R_C(RE(d5),IM(wa4[i]))*isign; + complex_t c2, c3, c4, t1, t2, t3, t4; + + RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]); + RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]); + IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]); + IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]); + RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]); + IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]); + IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]); + RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]); + + RE(c2) = RE(t1) - RE(t4); + RE(c4) = RE(t1) + RE(t4); + + IM(c2) = IM(t1) - IM(t4); + IM(c4) = IM(t1) + IM(t4); + + RE(ch[ah+i]) = RE(t2) + RE(t3); + RE(c3) = RE(t2) - RE(t3); + + IM(ch[ah+i]) = IM(t2) + IM(t3); + IM(c3) = IM(t2) - IM(t3); + + ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]), + RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]), + RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i])); + ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]), + RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i])); + } + } + } +} + +static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3, + const complex_t *wa4, const int8_t isign) +{ + static real_t tr11 = FRAC_CONST(0.309016994374947); + static real_t ti11 = FRAC_CONST(0.951056516295154); + static real_t tr12 = FRAC_CONST(-0.809016994374947); + static real_t ti12 = FRAC_CONST(0.587785252292473); + uint16_t i, k, ac, ah; + complex_t c2, c3, c4, c5, d3, d4, d5, d2, t2, t3, t4, t5; + + if (ido == 1) + { + if (isign == 1) + { + for (k = 0; k < l1; k++) + { + ac = 5*k + 1; + ah = k; + + RE(t2) = RE(cc[ac]) + RE(cc[ac+3]); + IM(t2) = IM(cc[ac]) + IM(cc[ac+3]); + RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]); + IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]); + RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]); + IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]); + RE(t5) = RE(cc[ac]) - RE(cc[ac+3]); + IM(t5) = IM(cc[ac]) - IM(cc[ac+3]); + + RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3); + IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3); + + RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12); + IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12); + RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11); + IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11); + + ComplexMult(&RE(c5), &RE(c4), + ti11, ti12, RE(t5), RE(t4)); + ComplexMult(&IM(c5), &IM(c4), + ti11, ti12, IM(t5), IM(t4)); + + RE(ch[ah+l1]) = RE(c2) - IM(c5); + IM(ch[ah+l1]) = IM(c2) + RE(c5); + RE(ch[ah+2*l1]) = RE(c3) - IM(c4); + IM(ch[ah+2*l1]) = IM(c3) + RE(c4); + RE(ch[ah+3*l1]) = RE(c3) + IM(c4); + IM(ch[ah+3*l1]) = IM(c3) - RE(c4); + RE(ch[ah+4*l1]) = RE(c2) + IM(c5); + IM(ch[ah+4*l1]) = IM(c2) - RE(c5); + } + } else { + for (k = 0; k < l1; k++) + { + ac = 5*k + 1; + ah = k; + + RE(t2) = RE(cc[ac]) + RE(cc[ac+3]); + IM(t2) = IM(cc[ac]) + IM(cc[ac+3]); + RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]); + IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]); + 
RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]); + IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]); + RE(t5) = RE(cc[ac]) - RE(cc[ac+3]); + IM(t5) = IM(cc[ac]) - IM(cc[ac+3]); + + RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3); + IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3); + + RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12); + IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12); + RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11); + IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11); + + ComplexMult(&RE(c4), &RE(c5), + ti12, ti11, RE(t5), RE(t4)); + ComplexMult(&IM(c4), &IM(c5), + ti12, ti12, IM(t5), IM(t4)); + + RE(ch[ah+l1]) = RE(c2) + IM(c5); + IM(ch[ah+l1]) = IM(c2) - RE(c5); + RE(ch[ah+2*l1]) = RE(c3) + IM(c4); + IM(ch[ah+2*l1]) = IM(c3) - RE(c4); + RE(ch[ah+3*l1]) = RE(c3) - IM(c4); + IM(ch[ah+3*l1]) = IM(c3) + RE(c4); + RE(ch[ah+4*l1]) = RE(c2) - IM(c5); + IM(ch[ah+4*l1]) = IM(c2) + RE(c5); + } + } + } else { + if (isign == 1) + { + for (k = 0; k < l1; k++) + { + for (i = 0; i < ido; i++) + { + ac = i + (k*5 + 1) * ido; + ah = i + k * ido; + + RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]); + IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]); + RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]); + IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]); + RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]); + IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]); + RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]); + IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]); + + RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3); + IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3); + + RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12); + IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12); + RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11); + IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11); + + ComplexMult(&RE(c5), &RE(c4), + ti11, ti12, RE(t5), RE(t4)); + ComplexMult(&IM(c5), &IM(c4), + ti11, ti12, IM(t5), IM(t4)); + + IM(d2) = IM(c2) + RE(c5); + IM(d3) = IM(c3) + RE(c4); + RE(d4) = RE(c3) + IM(c4); + RE(d5) = RE(c2) + IM(c5); + RE(d2) = RE(c2) - IM(c5); + IM(d5) = IM(c2) - RE(c5); + RE(d3) = RE(c3) - IM(c4); + IM(d4) = IM(c3) - RE(c4); + + ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]), + IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]), + IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i])); + ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]), + IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i])); + ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]), + IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i])); + } + } + } else { + for (k = 0; k < l1; k++) + { + for (i = 0; i < ido; i++) + { + ac = i + (k*5 + 1) * ido; + ah = i + k * ido; + + RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]); + IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]); + RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]); + IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]); + RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]); + IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]); + RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]); + IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]); + + RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3); + IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3); + + RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12); + IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12); + RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11); + IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11); + + ComplexMult(&RE(c4), &RE(c5), + ti12, ti11, RE(t5), RE(t4)); + ComplexMult(&IM(c4), &IM(c5), + ti12, ti12, 
IM(t5), IM(t4)); + + IM(d2) = IM(c2) - RE(c5); + IM(d3) = IM(c3) - RE(c4); + RE(d4) = RE(c3) - IM(c4); + RE(d5) = RE(c2) - IM(c5); + RE(d2) = RE(c2) + IM(c5); + IM(d5) = IM(c2) + RE(c5); + RE(d3) = RE(c3) + IM(c4); + IM(d4) = IM(c3) + RE(c4); + + ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]), + RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]), + RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i])); + ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]), + RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i])); + ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]), + RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i])); + } } } } @@ -338,8 +842,9 @@ static void passf5(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch, cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs. ----------------------------------------------------------------------*/ -INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch, - uint16_t *ifac, complex_t *wa, int8_t isign) +#ifdef USE_SSE +INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch, + const uint16_t *ifac, const complex_t *wa, const int8_t isign) { uint16_t i; uint16_t k1, l1, l2; @@ -359,11 +864,22 @@ INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch, switch (ip) { + case 4: + ix2 = iw + ido; + ix3 = ix2 + ido; + + if (na == 0) + passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); + else + passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); + + na = 1 - na; + break; case 2: if (na == 0) - passf2(ido, l1, c, ch, &wa[iw], isign); + passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); else - passf2(ido, l1, ch, c, &wa[iw], isign); + passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); na = 1 - na; break; @@ -371,20 +887,88 @@ INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch, ix2 = iw + ido; if (na == 0) - passf3(ido, l1, c, ch, &wa[iw], &wa[ix2], isign); + passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign); else - passf3(ido, l1, ch, c, &wa[iw], &wa[ix2], isign); + passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign); na = 1 - na; break; + case 5: + ix2 = iw + ido; + ix3 = ix2 + ido; + ix4 = ix3 + ido; + + if (na == 0) + passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + else + passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + + na = 1 - na; + break; + } + + l1 = l2; + iw += (ip-1) * ido; + } + + if (na == 0) + return; + + for (i = 0; i < n; i++) + { + RE(c[i]) = RE(ch[i]); + IM(c[i]) = IM(ch[i]); + } +} +#endif + +INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch, + const uint16_t *ifac, const complex_t *wa, const int8_t isign) +{ + uint16_t i; + uint16_t k1, l1, l2; + uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1; + + nf = ifac[1]; + na = 0; + l1 = 1; + iw = 0; + + for (k1 = 2; k1 <= nf+1; k1++) + { + ip = ifac[k1]; + l2 = ip*l1; + ido = n / l2; + idl1 = ido*l1; + + switch (ip) + { case 4: ix2 = iw + ido; ix3 = ix2 + ido; if (na == 0) - passf4(ido, l1, c, ch, &wa[iw], &wa[ix2], &wa[ix3], isign); + passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); else - passf4(ido, l1, ch, c, &wa[iw], &wa[ix2], &wa[ix3], isign); + 
passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); + + na = 1 - na; + break; + case 2: + if (na == 0) + passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); + else + passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); + + na = 1 - na; + break; + case 3: + ix2 = iw + ido; + + if (na == 0) + passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign); + else + passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign); na = 1 - na; break; @@ -394,9 +978,87 @@ INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch, ix4 = ix3 + ido; if (na == 0) - passf5(ido, l1, c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); else - passf5(ido, l1, ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + + na = 1 - na; + break; + } + + l1 = l2; + iw += (ip-1) * ido; + } + + if (na == 0) + return; + + for (i = 0; i < n; i++) + { + RE(c[i]) = RE(ch[i]); + IM(c[i]) = IM(ch[i]); + } +} + +INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch, + const uint16_t *ifac, const complex_t *wa, const int8_t isign) +{ + uint16_t i; + uint16_t k1, l1, l2; + uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1; + + nf = ifac[1]; + na = 0; + l1 = 1; + iw = 0; + + for (k1 = 2; k1 <= nf+1; k1++) + { + ip = ifac[k1]; + l2 = ip*l1; + ido = n / l2; + idl1 = ido*l1; + + switch (ip) + { + case 4: + ix2 = iw + ido; + ix3 = ix2 + ido; + + if (na == 0) + passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); + else + passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); + + na = 1 - na; + break; + case 2: + if (na == 0) + passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); + else + passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); + + na = 1 - na; + break; + case 3: + ix2 = iw + ido; + + if (na == 0) + passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign); + else + passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign); + + na = 1 - na; + break; + case 5: + ix2 = iw + ido; + ix3 = ix2 + ido; + ix4 = ix3 + ido; + + if (na == 0) + passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + else + passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); na = 1 - na; break; @@ -418,14 +1080,21 @@ INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch, void cfftf(cfft_info *cfft, complex_t *c) { - cfftf1(cfft->n, c, cfft->work, cfft->ifac, cfft->tab, -1); + cfftf1neg(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1); } void cfftb(cfft_info *cfft, complex_t *c) { - cfftf1(cfft->n, c, cfft->work, cfft->ifac, cfft->tab, +1); + cfftf1pos(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1); } +#ifdef USE_SSE +void cfftb_sse(cfft_info *cfft, complex_t *c) +{ + cfftf1pos_sse(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const 
complex_t*)cfft->tab, +1); +} +#endif + static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac) { static uint16_t ntryh[4] = {3, 4, 2, 5}; @@ -478,7 +1147,7 @@ startloop: ifac[1] = nf; #ifndef FIXED_POINT - argh = 2.0*M_PI / (real_t)n; + argh = (real_t)2.0*(real_t)M_PI / (real_t)n; i = 0; l1 = 1; @@ -504,8 +1173,8 @@ startloop: i++; fi++; arg = fi * argld; - RE(wa[i]) = cos(arg); - IM(wa[i]) = sin(arg); + RE(wa[i]) = (real_t)cos(arg); + IM(wa[i]) = (real_t)sin(arg); } if (ip > 5) @@ -521,13 +1190,13 @@ startloop: cfft_info *cffti(uint16_t n) { - cfft_info *cfft = (cfft_info*)malloc(sizeof(cfft_info)); + cfft_info *cfft = (cfft_info*)faad_malloc(sizeof(cfft_info)); cfft->n = n; - cfft->work = (complex_t*)malloc(n*sizeof(complex_t)); + cfft->work = (complex_t*)faad_malloc(n*sizeof(complex_t)); #ifndef FIXED_POINT - cfft->tab = (complex_t*)malloc(n*sizeof(complex_t)); + cfft->tab = (complex_t*)faad_malloc(n*sizeof(complex_t)); cffti1(n, cfft->tab, cfft->ifac); #else @@ -535,14 +1204,19 @@ cfft_info *cffti(uint16_t n) switch (n) { - case 60: cfft->tab = cfft_tab_60; break; case 64: cfft->tab = cfft_tab_64; break; - case 480: cfft->tab = cfft_tab_480; break; case 512: cfft->tab = cfft_tab_512; break; #ifdef LD_DEC - case 240: cfft->tab = cfft_tab_240; break; case 256: cfft->tab = cfft_tab_256; break; #endif + +#ifdef ALLOW_SMALL_FRAMELENGTH + case 60: cfft->tab = cfft_tab_60; break; + case 480: cfft->tab = cfft_tab_480; break; +#ifdef LD_DEC + case 240: cfft->tab = cfft_tab_240; break; +#endif +#endif } #endif @@ -551,11 +1225,11 @@ cfft_info *cffti(uint16_t n) void cfftu(cfft_info *cfft) { - if (cfft->work) free(cfft->work); + if (cfft->work) faad_free(cfft->work); #ifndef FIXED_POINT - if (cfft->tab) free(cfft->tab); + if (cfft->tab) faad_free(cfft->tab); #endif - if (cfft) free(cfft); + if (cfft) faad_free(cfft); } |
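The recurring change in this diff is the replacement of the old inline MUL_R_C twiddle arithmetic with calls to ComplexMult, whose scalar definition is quoted in the #if 0 block inside passf4pos_sse above. The standalone sketch below is not part of the FAAD2 sources; it assumes the floating-point (non-FIXED_POINT) build, where real_t is float, MUL_F is an ordinary multiply, and complex_t is a two-element array addressed via RE()/IM(). It shows one radix-2 butterfly written the way the new passf2neg inner loop uses the helper; the *pos variants simply swap the real/imaginary argument order, and the SSE path computes four such products per loop iteration.

```c
/* Minimal scalar sketch (not the FAAD2 source). Assumes the floating-point
 * build: real_t is float, MUL_F(a,b) is a plain multiply, and a complex_t
 * is a two-element array accessed via RE()/IM(). */
#include <stdio.h>

typedef float real_t;
typedef real_t complex_t[2];
#define RE(x) ((x)[0])
#define IM(x) ((x)[1])
#define MUL_F(a, b) ((a) * (b))

/* Fused complex multiply as documented in the #if 0 block of the diff.
 * With this argument order it computes (x1 + j*x2) * (c1 - j*c2), i.e. a
 * rotation by the conjugate twiddle, which is how passf2neg calls it. */
static void ComplexMult(real_t *y1, real_t *y2,
                        real_t x1, real_t x2, real_t c1, real_t c2)
{
    *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);
    *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);
}

int main(void)
{
    /* One radix-2 butterfly with a twiddle factor, mirroring the inner
     * loop of passf2neg: sum = cc0 + cc1, t2 = cc0 - cc1, then rotate
     * t2 by the (example) twiddle wa. */
    complex_t cc0 = { 1.0f, 2.0f }, cc1 = { 0.5f, -1.0f };
    complex_t wa  = { 0.70710678f, 0.70710678f };   /* illustrative twiddle */
    complex_t sum, rot, t2;

    RE(sum) = RE(cc0) + RE(cc1);
    IM(sum) = IM(cc0) + IM(cc1);
    RE(t2)  = RE(cc0) - RE(cc1);
    IM(t2)  = IM(cc0) - IM(cc1);

    ComplexMult(&RE(rot), &IM(rot), RE(t2), IM(t2), RE(wa), IM(wa));

    printf("sum = %f %+fi, rotated diff = %f %+fi\n",
           RE(sum), IM(sum), RE(rot), IM(rot));
    return 0;
}
```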