Diffstat (limited to 'src/libfaad/cfft.c')
-rw-r--r--  src/libfaad/cfft.c  1116
1 file changed, 895 insertions, 221 deletions
diff --git a/src/libfaad/cfft.c b/src/libfaad/cfft.c
index a18527dbc..bbdfc36df 100644
--- a/src/libfaad/cfft.c
+++ b/src/libfaad/cfft.c
@@ -1,22 +1,28 @@
/*
-** FAAD - Freeware Advanced Audio Decoder
-** Copyright (C) 2002 M. Bakker
-**
+** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
+** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com
+**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
-**
+**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
-**
+**
** You should have received a copy of the GNU General Public License
-** along with this program; if not, write to the Free Software
+** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**
-** $Id: cfft.c,v 1.6 2003/04/12 14:58:46 miguelfreitas Exp $
+** Any non-GPL usage of this software or parts of this software is strictly
+** forbidden.
+**
+** Commercial non-GPL licensing of this software is possible.
+** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
+**
+** $Id: cfft.c,v 1.7 2003/12/30 02:00:10 miguelfreitas Exp $
**/
/*
@@ -37,27 +43,91 @@
#include "cfft_tab.h"
-/* static declarations moved to avoid compiler warnings [MF] */
-static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
- complex_t *wa, int8_t isign);
-static void passf3(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
- complex_t *wa1, complex_t *wa2, int8_t isign);
-static void passf4(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
- complex_t *wa1, complex_t *wa2, complex_t *wa3, int8_t isign);
-static void passf5(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
- complex_t *wa1, complex_t *wa2, complex_t *wa3, complex_t *wa4,
- int8_t isign);
-INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
- uint16_t *ifac, complex_t *wa, int8_t isign);
-static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac);
-
-
/*----------------------------------------------------------------------
passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd.
----------------------------------------------------------------------*/
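Each passfN routine computes one radix-N stage of the mixed-radix FFT: it combines N interleaved
input columns and rotates every output except the first by the twiddle factors in wa. As a scalar
sketch of the radix-2 case, essentially what the passf2pos routine added below does, written with
FAAD2's RE()/IM() accessors and the ComplexMult() helper from common.h:

    for (k = 0; k < l1; k++)
    {
        for (i = 0; i < ido; i++)
        {
            uint16_t in  = 2*k*ido + i;   /* first input of the pair   */
            uint16_t out = k*ido + i;     /* corresponding output slot */
            complex_t t;

            /* butterfly sum: first half of the output block */
            RE(ch[out]) = RE(cc[in]) + RE(cc[in+ido]);
            IM(ch[out]) = IM(cc[in]) + IM(cc[in+ido]);

            /* butterfly difference, rotated by the twiddle factor wa[i] */
            RE(t) = RE(cc[in]) - RE(cc[in+ido]);
            IM(t) = IM(cc[in]) - IM(cc[in+ido]);
            ComplexMult(&IM(ch[out + l1*ido]), &RE(ch[out + l1*ido]),
                IM(t), RE(t), RE(wa[i]), IM(wa[i]));
        }
    }

The *_neg variants only swap the real/imaginary argument order passed to ComplexMult(), which
conjugates the twiddle factor for the opposite transform direction.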
-static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
- complex_t *wa, int8_t isign)
+#ifdef USE_SSE
+static void passf2pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa)
+{
+ uint16_t i, k, ah, ac;
+
+ if (ido == 1)
+ {
+ for (k = 0; k < l1; k++)
+ {
+ ah = 2*k;
+ ac = 4*k;
+
+ RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
+ IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
+
+ RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
+ IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
+ }
+ } else {
+ for (k = 0; k < l1; k++)
+ {
+ ah = k*ido;
+ ac = 2*k*ido;
+
+ for (i = 0; i < ido; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
+ __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
+ __m128 w1, w2, w3, w4;
+
+ m1 = _mm_load_ps(&RE(cc[ac+i]));
+ m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
+ m5 = _mm_load_ps(&RE(cc[ac+i+2]));
+ m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
+ w1 = _mm_load_ps(&RE(wa[i]));
+ w3 = _mm_load_ps(&RE(wa[i+2]));
+
+ m3 = _mm_add_ps(m1, m2);
+ m15 = _mm_add_ps(m5, m6);
+
+ m4 = _mm_sub_ps(m1, m2);
+ m16 = _mm_sub_ps(m5, m6);
+
+ _mm_store_ps(&RE(ch[ah+i]), m3);
+ _mm_store_ps(&RE(ch[ah+i+2]), m15);
+
+ w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
+ w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
+
+ m7 = _mm_mul_ps(m4, w1);
+ m17 = _mm_mul_ps(m16, w3);
+ m8 = _mm_mul_ps(m4, w2);
+ m18 = _mm_mul_ps(m16, w4);
+
+ m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
+ m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
+ m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
+ m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));
+
+ m11 = _mm_add_ps(m9, m10);
+ m21 = _mm_add_ps(m19, m20);
+ m12 = _mm_sub_ps(m9, m10);
+ m22 = _mm_sub_ps(m19, m20);
+
+ m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
+ m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));
+
+ m14 = _mm_unpacklo_ps(m12, m13);
+ m24 = _mm_unpacklo_ps(m22, m23);
+
+ _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
+ _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
+ }
+ }
+ }
+}
+#endif
+
+static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa)
{
uint16_t i, k, ah, ac;
@@ -68,9 +138,9 @@ static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
ah = 2*k;
ac = 4*k;
- RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
- IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
+ RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
+ IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
}
} else {
@@ -83,251 +153,685 @@ static void passf2(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
{
complex_t t2;
- RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+ido]);
- IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+ido]);
+ RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
+ RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
- RE(t2) = RE(cc[ac]) - RE(cc[ac+ido]);
- IM(t2) = IM(cc[ac]) - IM(cc[ac+ido]);
+ IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
+ IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
- RE(ch[ah+l1*ido]) = MUL_R_C(RE(t2),RE(wa[i])) - MUL_R_C(IM(t2),IM(wa[i]))*isign;
- IM(ch[ah+l1*ido]) = MUL_R_C(IM(t2),RE(wa[i])) + MUL_R_C(RE(t2),IM(wa[i]))*isign;
- ah++;
- ac++;
+ ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
+ IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
}
}
}
}
+static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa)
+{
+ uint16_t i, k, ah, ac;
+
+ if (ido == 1)
+ {
+ for (k = 0; k < l1; k++)
+ {
+ ah = 2*k;
+ ac = 4*k;
-static void passf3(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
- complex_t *wa1, complex_t *wa2, int8_t isign)
+ RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
+ RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
+ IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
+ IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
+ }
+ } else {
+ for (k = 0; k < l1; k++)
+ {
+ ah = k*ido;
+ ac = 2*k*ido;
+
+ for (i = 0; i < ido; i++)
+ {
+ complex_t t2;
+
+ RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
+ RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
+
+ IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
+ IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
+
+ ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
+ RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
+ }
+ }
+ }
+}
+
+
+static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+ const int8_t isign)
{
- static real_t taur = COEF_CONST(-0.5);
- static real_t taui = COEF_CONST(0.866025403784439);
+ static real_t taur = FRAC_CONST(-0.5);
+ static real_t taui = FRAC_CONST(0.866025403784439);
uint16_t i, k, ac, ah;
complex_t c2, c3, d2, d3, t2;
if (ido == 1)
{
- for (k = 0; k < l1; k++)
+ if (isign == 1)
{
- ac = 3*k+1;
- ah = k;
+ for (k = 0; k < l1; k++)
+ {
+ ac = 3*k+1;
+ ah = k;
- RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
- IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
- RE(c2) = RE(cc[ac-1]) + MUL_R_C(RE(t2),taur);
- IM(c2) = IM(cc[ac-1]) + MUL_R_C(IM(t2),taur);
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
+ RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur);
+ IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur);
- RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
- IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
+ RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
+ IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
- RE(c3) = MUL_R_C((RE(cc[ac]) - RE(cc[ac+1])), taui)*isign;
- IM(c3) = MUL_R_C((IM(cc[ac]) - IM(cc[ac+1])), taui)*isign;
+ RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui);
+ IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui);
- RE(ch[ah+l1]) = RE(c2) - IM(c3);
- IM(ch[ah+l1]) = IM(c2) + RE(c3);
- RE(ch[ah+2*l1]) = RE(c2) + IM(c3);
- IM(ch[ah+2*l1]) = IM(c2) - RE(c3);
+ RE(ch[ah+l1]) = RE(c2) - IM(c3);
+ IM(ch[ah+l1]) = IM(c2) + RE(c3);
+ RE(ch[ah+2*l1]) = RE(c2) + IM(c3);
+ IM(ch[ah+2*l1]) = IM(c2) - RE(c3);
+ }
+ } else {
+ for (k = 0; k < l1; k++)
+ {
+ ac = 3*k+1;
+ ah = k;
+
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
+ RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur);
+ IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur);
+
+ RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
+ IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
+
+ RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui);
+ IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui);
+
+ RE(ch[ah+l1]) = RE(c2) + IM(c3);
+ IM(ch[ah+l1]) = IM(c2) - RE(c3);
+ RE(ch[ah+2*l1]) = RE(c2) - IM(c3);
+ IM(ch[ah+2*l1]) = IM(c2) + RE(c3);
+ }
}
} else {
- for (k = 0; k < l1; k++)
+ if (isign == 1)
{
- for (i = 0; i < ido; i++)
+ for (k = 0; k < l1; k++)
+ {
+ for (i = 0; i < ido; i++)
+ {
+ ac = i + (3*k+1)*ido;
+ ah = i + k * ido;
+
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
+ RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
+ IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur);
+
+ RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
+ IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
+
+ RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui);
+ IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui);
+
+ RE(d2) = RE(c2) - IM(c3);
+ IM(d3) = IM(c2) - RE(c3);
+ RE(d3) = RE(c2) + IM(c3);
+ IM(d2) = IM(c2) + RE(c3);
+
+ ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
+ IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
+ IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
+ }
+ }
+ } else {
+ for (k = 0; k < l1; k++)
{
- ac = i + (3*k+1)*ido;
- ah = i + k * ido;
+ for (i = 0; i < ido; i++)
+ {
+ ac = i + (3*k+1)*ido;
+ ah = i + k * ido;
+
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
+ RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
+ IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur);
+
+ RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
+ IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
+
+ RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui);
+ IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui);
+
+ RE(d2) = RE(c2) + IM(c3);
+ IM(d3) = IM(c2) + RE(c3);
+ RE(d3) = RE(c2) - IM(c3);
+ IM(d2) = IM(c2) - RE(c3);
+
+ ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
+ RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
+ RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
+ }
+ }
+ }
+ }
+}
- RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
- RE(c2) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),taur);
- IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
- IM(c2) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),taur);
+#ifdef USE_SSE
+static void passf4pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+ const complex_t *wa3)
+{
+ uint16_t i, k, ac, ah;
- RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
- IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
+ if (ido == 1)
+ {
+ for (k = 0; k < l1; k+=2)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10;
+ __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, n10;
+ __m128 neg1 = _mm_set_ps(-1.0, 1.0, 1.0, 1.0);
- RE(c3) = MUL_R_C((RE(cc[ac]) - RE(cc[ac+ido])), taui)*isign;
- IM(c3) = MUL_R_C((IM(cc[ac]) - IM(cc[ac+ido])), taui)*isign;
+ m1 = _mm_load_ps(&RE(cc[4*k]));
+ m2 = _mm_load_ps(&RE(cc[4*k+2]));
+ n1 = _mm_load_ps(&RE(cc[4*k+4]));
+ n2 = _mm_load_ps(&RE(cc[4*k+6]));
- RE(d2) = RE(c2) - IM(c3);
- IM(d3) = IM(c2) - RE(c3);
- RE(d3) = RE(c2) + IM(c3);
- IM(d2) = IM(c2) + RE(c3);
+ m3 = _mm_add_ps(m1, m2);
- RE(ch[ah+l1*ido]) = MUL_R_C(RE(d2),RE(wa1[i])) - MUL_R_C(IM(d2),IM(wa1[i]))*isign;
- IM(ch[ah+l1*ido]) = MUL_R_C(IM(d2),RE(wa1[i])) + MUL_R_C(RE(d2),IM(wa1[i]))*isign;
- RE(ch[ah+l1*2*ido]) = MUL_R_C(RE(d3),RE(wa2[i])) - MUL_R_C(IM(d3),IM(wa2[i]))*isign;
- IM(ch[ah+l1*2*ido]) = MUL_R_C(IM(d3),RE(wa2[i])) + MUL_R_C(RE(d3),IM(wa2[i]))*isign;
+ n4 = _mm_mul_ps(neg1, n1);
+ n5 = _mm_mul_ps(neg1, n2);
+ m4 = _mm_mul_ps(neg1, m1);
+ m5 = _mm_mul_ps(neg1, m2);
+
+ n3 = _mm_add_ps(n1, n2);
+ m6 = _mm_sub_ps(m4, m5);
+
+ m7 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(1, 0, 1, 0));
+ n6 = _mm_sub_ps(n4, n5);
+ m8 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(3, 2, 3, 2));
+
+ n7 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(1, 0, 1, 0));
+ m9 = _mm_add_ps(m7, m8);
+ n8 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(2, 3, 2, 3));
+
+ m10 = _mm_sub_ps(m7, m8);
+ n9 = _mm_add_ps(n7, n8);
+
+ _mm_store_ps(&RE(ch[k]), m9);
+ n10 = _mm_sub_ps(n7, n8);
+ _mm_store_ps(&RE(ch[k+l1]), n9);
+ _mm_store_ps(&RE(ch[k+2*l1]), m10);
+ _mm_store_ps(&RE(ch[k+3*l1]), n10);
+ }
+ } else {
+ for (k = 0; k < l1; k++)
+ {
+ ac = 4*k*ido;
+ ah = k*ido;
+
+ for (i = 0; i < ido; i+=2)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16;
+ __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23;
+ __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30;
+ __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
+
+ m1 = _mm_load_ps(&RE(cc[ac+i]));
+ m2 = _mm_load_ps(&RE(cc[ac+i+2*ido]));
+ m3 = _mm_add_ps(m1, m2);
+ m4 = _mm_sub_ps(m1, m2);
+
+ n1 = _mm_load_ps(&RE(cc[ac+i+ido]));
+ n2 = _mm_load_ps(&RE(cc[ac+i+3*ido]));
+ n3 = _mm_add_ps(n1, n2);
+
+ n4 = _mm_mul_ps(neg1, n1);
+ n5 = _mm_mul_ps(neg1, n2);
+ n6 = _mm_sub_ps(n4, n5);
+
+ m5 = _mm_add_ps(m3, n3);
+
+ n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1));
+ n8 = _mm_add_ps(m4, n7);
+
+ m6 = _mm_sub_ps(m3, n3);
+ n9 = _mm_sub_ps(m4, n7);
+
+ _mm_store_ps(&RE(ch[ah+i]), m5);
+
+#if 0
+ static INLINE void ComplexMult(real_t *y1, real_t *y2,
+ real_t x1, real_t x2, real_t c1, real_t c2)
+ {
+ *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);
+ *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);
+ }
+
+ m7.0 = RE(c2)*RE(wa1[i])
+ m7.1 = IM(c2)*IM(wa1[i])
+ m7.2 = RE(c6)*RE(wa1[i+1])
+ m7.3 = IM(c6)*IM(wa1[i+1])
+
+ m8.0 = RE(c2)*IM(wa1[i])
+ m8.1 = IM(c2)*RE(wa1[i])
+ m8.2 = RE(c6)*IM(wa1[i+1])
+ m8.3 = IM(c6)*RE(wa1[i+1])
+
+ RE(0) = m7.0 - m7.1
+ IM(0) = m8.0 + m8.1
+ RE(1) = m7.2 - m7.3
+ IM(1) = m8.2 + m8.3
+
+////
+ RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i])
+ IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i])
+ RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1])
+ IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])
+#endif
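+            /* The remainder of this iteration applies the twiddle factors
+             * wa1/wa2/wa3 to the three rotated outputs (n8, m6 and n9): the
+             * load/shuffle/mul/add sequence below forms, two complex values
+             * per register, the same products that ComplexMult() produces in
+             * the scalar passf4pos() path. */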
+
+ w1 = _mm_load_ps(&RE(wa1[i]));
+ w3 = _mm_load_ps(&RE(wa2[i]));
+ w5 = _mm_load_ps(&RE(wa3[i]));
+
+ w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
+ w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
+ w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1));
+
+ m7 = _mm_mul_ps(n8, w1);
+ m15 = _mm_mul_ps(m6, w3);
+ m23 = _mm_mul_ps(n9, w5);
+ m8 = _mm_mul_ps(n8, w2);
+ m16 = _mm_mul_ps(m6, w4);
+ m24 = _mm_mul_ps(n9, w6);
+
+ m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
+ m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0));
+ m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0));
+ m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
+ m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1));
+ m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1));
+
+ m11 = _mm_add_ps(m9, m10);
+ m19 = _mm_add_ps(m17, m18);
+ m27 = _mm_add_ps(m25, m26);
+ m12 = _mm_sub_ps(m9, m10);
+ m20 = _mm_sub_ps(m17, m18);
+ m28 = _mm_sub_ps(m25, m26);
+
+ m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
+ m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2));
+ m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2));
+ m14 = _mm_unpacklo_ps(m12, m13);
+ m22 = _mm_unpacklo_ps(m20, m21);
+ m30 = _mm_unpacklo_ps(m28, m29);
+
+ _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
+ _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22);
+ _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30);
}
}
}
}
+#endif
-
-static void passf4(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
- complex_t *wa1, complex_t *wa2, complex_t *wa3, int8_t isign)
+static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+ const complex_t *wa3)
{
uint16_t i, k, ac, ah;
- complex_t c2, c3, c4, t1, t2, t3, t4;
if (ido == 1)
{
for (k = 0; k < l1; k++)
{
+ complex_t t1, t2, t3, t4;
+
ac = 4*k;
ah = k;
- RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
- IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
+ RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
+ IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
- IM(t3) = IM(cc[ac+1]) + IM(cc[ac+3]);
- RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
- IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
- RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
+ IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
+ RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
- RE(ch[ah]) = RE(t2) + RE(t3);
- IM(ch[ah]) = IM(t2) + IM(t3);
- RE(ch[ah+l1]) = RE(t1) + RE(t4)*isign;
- IM(ch[ah+l1]) = IM(t1) + IM(t4)*isign;
+ RE(ch[ah]) = RE(t2) + RE(t3);
RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
+
+ IM(ch[ah]) = IM(t2) + IM(t3);
IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
- RE(ch[ah+3*l1]) = RE(t1) - RE(t4)*isign;
- IM(ch[ah+3*l1]) = IM(t1) - IM(t4)*isign;
+
+ RE(ch[ah+l1]) = RE(t1) + RE(t4);
+ RE(ch[ah+3*l1]) = RE(t1) - RE(t4);
+
+ IM(ch[ah+l1]) = IM(t1) + IM(t4);
+ IM(ch[ah+3*l1]) = IM(t1) - IM(t4);
}
} else {
for (k = 0; k < l1; k++)
{
+ ac = 4*k*ido;
+ ah = k*ido;
+
for (i = 0; i < ido; i++)
{
- ac = i + 4*k*ido;
- ah = i + k*ido;
-
- RE(t2) = RE(cc[ac]) + RE(cc[ac+2*ido]);
- IM(t2) = IM(cc[ac]) + IM(cc[ac+2*ido]);
- RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+3*ido]);
- IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+3*ido]);
- RE(t1) = RE(cc[ac]) - RE(cc[ac+2*ido]);
- IM(t1) = IM(cc[ac]) - IM(cc[ac+2*ido]);
- RE(t4) = IM(cc[ac+3*ido]) - IM(cc[ac+ido]);
- IM(t4) = RE(cc[ac+ido]) - RE(cc[ac+3*ido]);
-
- RE(ch[ah]) = RE(t2) + RE(t3);
- IM(ch[ah]) = IM(t2) + IM(t3);
-
- RE(c2) = RE(t1) + RE(t4)*isign;
- IM(c2) = IM(t1) + IM(t4)*isign;
- RE(c3) = RE(t2) - RE(t3);
- IM(c3) = IM(t2) - IM(t3);
- RE(c4) = RE(t1) - RE(t4)*isign;
- IM(c4) = IM(t1) - IM(t4)*isign;
-
- RE(ch[ah+l1*ido]) = MUL_R_C(RE(c2),RE(wa1[i])) - MUL_R_C(IM(c2),IM(wa1[i]))*isign;
- IM(ch[ah+l1*ido]) = MUL_R_C(IM(c2),RE(wa1[i])) + MUL_R_C(RE(c2),IM(wa1[i]))*isign;
- RE(ch[ah+2*l1*ido]) = MUL_R_C(RE(c3),RE(wa2[i])) - MUL_R_C(IM(c3),IM(wa2[i]))*isign;
- IM(ch[ah+2*l1*ido]) = MUL_R_C(IM(c3),RE(wa2[i])) + MUL_R_C(RE(c3),IM(wa2[i]))*isign;
- RE(ch[ah+3*l1*ido]) = MUL_R_C(RE(c4),RE(wa3[i])) - MUL_R_C(IM(c4),IM(wa3[i]))*isign;
- IM(ch[ah+3*l1*ido]) = MUL_R_C(IM(c4),RE(wa3[i])) + MUL_R_C(RE(c4),IM(wa3[i]))*isign;
+ complex_t c2, c3, c4, t1, t2, t3, t4;
+
+ RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
+ RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
+ IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
+ IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
+ RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
+ IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
+ IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
+ RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
+
+ RE(c2) = RE(t1) + RE(t4);
+ RE(c4) = RE(t1) - RE(t4);
+
+ IM(c2) = IM(t1) + IM(t4);
+ IM(c4) = IM(t1) - IM(t4);
+
+ RE(ch[ah+i]) = RE(t2) + RE(t3);
+ RE(c3) = RE(t2) - RE(t3);
+
+ IM(ch[ah+i]) = IM(t2) + IM(t3);
+ IM(c3) = IM(t2) - IM(t3);
+
+ ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
+ IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
+ IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
+ ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
+ IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
}
}
}
}
-
-static void passf5(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
- complex_t *wa1, complex_t *wa2, complex_t *wa3, complex_t *wa4,
- int8_t isign)
+static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+ const complex_t *wa3)
{
- static real_t tr11 = COEF_CONST(0.309016994374947);
- static real_t ti11 = COEF_CONST(0.951056516295154);
- static real_t tr12 = COEF_CONST(-0.809016994374947);
- static real_t ti12 = COEF_CONST(0.587785252292473);
uint16_t i, k, ac, ah;
- complex_t c2, c3, c4, c5, d3, d4, d5, d2, t2, t3, t4, t5;
if (ido == 1)
{
for (k = 0; k < l1; k++)
{
- ac = 5*k + 1;
+ complex_t t1, t2, t3, t4;
+
+ ac = 4*k;
ah = k;
- RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
- IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
- RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
- IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
- RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
- IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
- RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
- IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
-
- RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
- IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
-
- RE(c2) = RE(cc[ac-1]) + MUL_R_C(RE(t2),tr11) + MUL_R_C(RE(t3),tr12);
- IM(c2) = IM(cc[ac-1]) + MUL_R_C(IM(t2),tr11) + MUL_R_C(IM(t3),tr12);
- RE(c3) = RE(cc[ac-1]) + MUL_R_C(RE(t2),tr12) + MUL_R_C(RE(t3),tr11);
- IM(c3) = IM(cc[ac-1]) + MUL_R_C(IM(t2),tr12) + MUL_R_C(IM(t3),tr11);
- RE(c4) = (MUL_R_C(RE(t5),ti12)*isign - MUL_R_C(RE(t4),ti11));
- IM(c4) = (MUL_R_C(IM(t5),ti12)*isign - MUL_R_C(IM(t4),ti11));
- RE(c5) = (MUL_R_C(RE(t5),ti11)*isign + MUL_R_C(RE(t4),ti12));
- IM(c5) = (MUL_R_C(IM(t5),ti11)*isign + MUL_R_C(IM(t4),ti12));
-
- RE(ch[ah+l1]) = RE(c2) - IM(c5);
- IM(ch[ah+l1]) = IM(c2) + RE(c5);
- RE(ch[ah+2*l1]) = RE(c3) - IM(c4);
- IM(ch[ah+2*l1]) = IM(c3) + RE(c4);
- RE(ch[ah+3*l1]) = RE(c3) + IM(c4);
- IM(ch[ah+3*l1]) = IM(c3) - RE(c4);
- RE(ch[ah+4*l1]) = RE(c2) + IM(c5);
- IM(ch[ah+4*l1]) = IM(c2) - RE(c5);
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
+ RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
+ IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
+ RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
+ IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
+ IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
+ RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
+
+ RE(ch[ah]) = RE(t2) + RE(t3);
+ RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
+
+ IM(ch[ah]) = IM(t2) + IM(t3);
+ IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
+
+ RE(ch[ah+l1]) = RE(t1) - RE(t4);
+ RE(ch[ah+3*l1]) = RE(t1) + RE(t4);
+
+ IM(ch[ah+l1]) = IM(t1) - IM(t4);
+ IM(ch[ah+3*l1]) = IM(t1) + IM(t4);
}
} else {
for (k = 0; k < l1; k++)
{
+ ac = 4*k*ido;
+ ah = k*ido;
+
for (i = 0; i < ido; i++)
{
- ac = i + (k*5 + 1) * ido;
- ah = i + k * ido;
-
- RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
- IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
- RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
- IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
- RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
- IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
- RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
- IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
-
- RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
- IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
-
- RE(c2) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),tr11) + MUL_R_C(RE(t3),tr12);
- IM(c2) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),tr11) + MUL_R_C(IM(t3),tr12);
- RE(c3) = RE(cc[ac-ido]) + MUL_R_C(RE(t2),tr12) + MUL_R_C(RE(t3),tr11);
- IM(c3) = IM(cc[ac-ido]) + MUL_R_C(IM(t2),tr12) + MUL_R_C(IM(t3),tr11);
- RE(c4) = (MUL_R_C(RE(t5),ti12)*isign - MUL_R_C(RE(t4),ti11));
- IM(c4) = (MUL_R_C(IM(t5),ti12)*isign - MUL_R_C(IM(t4),ti11));
- RE(c5) = (MUL_R_C(RE(t5),ti11)*isign + MUL_R_C(RE(t4),ti12));
- IM(c5) = (MUL_R_C(IM(t5),ti11)*isign + MUL_R_C(IM(t4),ti12));
-
- IM(d2) = IM(c2) + RE(c5);
- IM(d3) = IM(c3) + RE(c4);
- RE(d4) = RE(c3) + IM(c4);
- RE(d5) = RE(c2) + IM(c5);
- RE(d2) = RE(c2) - IM(c5);
- IM(d5) = IM(c2) - RE(c5);
- RE(d3) = RE(c3) - IM(c4);
- IM(d4) = IM(c3) - RE(c4);
-
- RE(ch[ah+l1*ido]) = MUL_R_C(RE(d2),RE(wa1[i])) - MUL_R_C(IM(d2),IM(wa1[i]))*isign;
- IM(ch[ah+l1*ido]) = MUL_R_C(IM(d2),RE(wa1[i])) + MUL_R_C(RE(d2),IM(wa1[i]))*isign;
- RE(ch[ah+2*l1*ido]) = MUL_R_C(RE(d3),RE(wa2[i])) - MUL_R_C(IM(d3),IM(wa2[i]))*isign;
- IM(ch[ah+2*l1*ido]) = MUL_R_C(IM(d3),RE(wa2[i])) + MUL_R_C(RE(d3),IM(wa2[i]))*isign;
- RE(ch[ah+3*l1*ido]) = MUL_R_C(RE(d4),RE(wa3[i])) - MUL_R_C(IM(d4),IM(wa3[i]))*isign;
- IM(ch[ah+3*l1*ido]) = MUL_R_C(IM(d4),RE(wa3[i])) + MUL_R_C(RE(d4),IM(wa3[i]))*isign;
- RE(ch[ah+4*l1*ido]) = MUL_R_C(RE(d5),RE(wa4[i])) - MUL_R_C(IM(d5),IM(wa4[i]))*isign;
- IM(ch[ah+4*l1*ido]) = MUL_R_C(IM(d5),RE(wa4[i])) + MUL_R_C(RE(d5),IM(wa4[i]))*isign;
+ complex_t c2, c3, c4, t1, t2, t3, t4;
+
+ RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
+ RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
+ IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
+ IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
+ RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
+ IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
+ IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
+ RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
+
+ RE(c2) = RE(t1) - RE(t4);
+ RE(c4) = RE(t1) + RE(t4);
+
+ IM(c2) = IM(t1) - IM(t4);
+ IM(c4) = IM(t1) + IM(t4);
+
+ RE(ch[ah+i]) = RE(t2) + RE(t3);
+ RE(c3) = RE(t2) - RE(t3);
+
+ IM(ch[ah+i]) = IM(t2) + IM(t3);
+ IM(c3) = IM(t2) - IM(t3);
+
+ ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
+ RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
+ RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
+ ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
+ RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
+ }
+ }
+ }
+}
+
+static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
+ const complex_t *wa4, const int8_t isign)
+{
+ static real_t tr11 = FRAC_CONST(0.309016994374947);
+ static real_t ti11 = FRAC_CONST(0.951056516295154);
+ static real_t tr12 = FRAC_CONST(-0.809016994374947);
+ static real_t ti12 = FRAC_CONST(0.587785252292473);
+ uint16_t i, k, ac, ah;
+ complex_t c2, c3, c4, c5, d3, d4, d5, d2, t2, t3, t4, t5;
+
+ if (ido == 1)
+ {
+ if (isign == 1)
+ {
+ for (k = 0; k < l1; k++)
+ {
+ ac = 5*k + 1;
+ ah = k;
+
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
+ RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
+ IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
+ RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
+ IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
+ RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
+ IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
+
+ RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
+ IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
+
+ RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
+ IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
+ RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
+ IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
+
+ ComplexMult(&RE(c5), &RE(c4),
+ ti11, ti12, RE(t5), RE(t4));
+ ComplexMult(&IM(c5), &IM(c4),
+ ti11, ti12, IM(t5), IM(t4));
+
+ RE(ch[ah+l1]) = RE(c2) - IM(c5);
+ IM(ch[ah+l1]) = IM(c2) + RE(c5);
+ RE(ch[ah+2*l1]) = RE(c3) - IM(c4);
+ IM(ch[ah+2*l1]) = IM(c3) + RE(c4);
+ RE(ch[ah+3*l1]) = RE(c3) + IM(c4);
+ IM(ch[ah+3*l1]) = IM(c3) - RE(c4);
+ RE(ch[ah+4*l1]) = RE(c2) + IM(c5);
+ IM(ch[ah+4*l1]) = IM(c2) - RE(c5);
+ }
+ } else {
+ for (k = 0; k < l1; k++)
+ {
+ ac = 5*k + 1;
+ ah = k;
+
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
+ RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
+ IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
+ RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
+ IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
+ RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
+ IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
+
+ RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
+ IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
+
+ RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
+ IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
+ RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
+ IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
+
+ ComplexMult(&RE(c4), &RE(c5),
+ ti12, ti11, RE(t5), RE(t4));
+ ComplexMult(&IM(c4), &IM(c5),
+ ti12, ti11, IM(t5), IM(t4));
+
+ RE(ch[ah+l1]) = RE(c2) + IM(c5);
+ IM(ch[ah+l1]) = IM(c2) - RE(c5);
+ RE(ch[ah+2*l1]) = RE(c3) + IM(c4);
+ IM(ch[ah+2*l1]) = IM(c3) - RE(c4);
+ RE(ch[ah+3*l1]) = RE(c3) - IM(c4);
+ IM(ch[ah+3*l1]) = IM(c3) + RE(c4);
+ RE(ch[ah+4*l1]) = RE(c2) - IM(c5);
+ IM(ch[ah+4*l1]) = IM(c2) + RE(c5);
+ }
+ }
+ } else {
+ if (isign == 1)
+ {
+ for (k = 0; k < l1; k++)
+ {
+ for (i = 0; i < ido; i++)
+ {
+ ac = i + (k*5 + 1) * ido;
+ ah = i + k * ido;
+
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
+ RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
+ IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
+ RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
+ IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
+ RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
+ IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
+
+ RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
+ IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
+
+ RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
+ IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
+ RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
+ IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
+
+ ComplexMult(&RE(c5), &RE(c4),
+ ti11, ti12, RE(t5), RE(t4));
+ ComplexMult(&IM(c5), &IM(c4),
+ ti11, ti12, IM(t5), IM(t4));
+
+ IM(d2) = IM(c2) + RE(c5);
+ IM(d3) = IM(c3) + RE(c4);
+ RE(d4) = RE(c3) + IM(c4);
+ RE(d5) = RE(c2) + IM(c5);
+ RE(d2) = RE(c2) - IM(c5);
+ IM(d5) = IM(c2) - RE(c5);
+ RE(d3) = RE(c3) - IM(c4);
+ IM(d4) = IM(c3) - RE(c4);
+
+ ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
+ IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
+ IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
+ ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]),
+ IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
+ ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
+ IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
+ }
+ }
+ } else {
+ for (k = 0; k < l1; k++)
+ {
+ for (i = 0; i < ido; i++)
+ {
+ ac = i + (k*5 + 1) * ido;
+ ah = i + k * ido;
+
+ RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
+ IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
+ RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
+ IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
+ RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
+ IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
+ RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
+ IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
+
+ RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
+ IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
+
+ RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
+ IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
+ RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
+ IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
+
+ ComplexMult(&RE(c4), &RE(c5),
+ ti12, ti11, RE(t5), RE(t4));
+ ComplexMult(&IM(c4), &IM(c5),
+ ti12, ti11, IM(t5), IM(t4));
+
+ IM(d2) = IM(c2) - RE(c5);
+ IM(d3) = IM(c3) - RE(c4);
+ RE(d4) = RE(c3) - IM(c4);
+ RE(d5) = RE(c2) - IM(c5);
+ RE(d2) = RE(c2) + IM(c5);
+ IM(d5) = IM(c2) + RE(c5);
+ RE(d3) = RE(c3) + IM(c4);
+ IM(d4) = IM(c3) + RE(c4);
+
+ ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
+ RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
+ RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
+ ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]),
+ RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
+ ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
+ RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
+ }
}
}
}
@@ -338,8 +842,9 @@ static void passf5(uint16_t ido, uint16_t l1, complex_t *cc, complex_t *ch,
cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs.
----------------------------------------------------------------------*/
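As a concrete example of the driver below: for n = 64 (one of the table sizes handled in cffti
further down), cffti1 factors 64 = 4 * 4 * 4, so ifac[1] = 3 and ifac[2..4] = {4, 4, 4};
cfftf1pos/cfftf1neg then run three radix-4 stages with (ido, l1) = (16, 1), (4, 4) and (1, 16),
ping-ponging between c and the work buffer ch via the na flag and copying back at the end if the
result landed in ch.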
-INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
- uint16_t *ifac, complex_t *wa, int8_t isign)
+#ifdef USE_SSE
+INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
+ const uint16_t *ifac, const complex_t *wa, const int8_t isign)
{
uint16_t i;
uint16_t k1, l1, l2;
@@ -359,11 +864,22 @@ INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
switch (ip)
{
+ case 4:
+ ix2 = iw + ido;
+ ix3 = ix2 + ido;
+
+ if (na == 0)
+ passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
+ else
+ passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
+
+ na = 1 - na;
+ break;
case 2:
if (na == 0)
- passf2(ido, l1, c, ch, &wa[iw], isign);
+ passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
else
- passf2(ido, l1, ch, c, &wa[iw], isign);
+ passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
na = 1 - na;
break;
@@ -371,20 +887,88 @@ INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
ix2 = iw + ido;
if (na == 0)
- passf3(ido, l1, c, ch, &wa[iw], &wa[ix2], isign);
+ passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
else
- passf3(ido, l1, ch, c, &wa[iw], &wa[ix2], isign);
+ passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
na = 1 - na;
break;
+ case 5:
+ ix2 = iw + ido;
+ ix3 = ix2 + ido;
+ ix4 = ix3 + ido;
+
+ if (na == 0)
+ passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+ else
+ passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+
+ na = 1 - na;
+ break;
+ }
+
+ l1 = l2;
+ iw += (ip-1) * ido;
+ }
+
+ if (na == 0)
+ return;
+
+ for (i = 0; i < n; i++)
+ {
+ RE(c[i]) = RE(ch[i]);
+ IM(c[i]) = IM(ch[i]);
+ }
+}
+#endif
+
+INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,
+ const uint16_t *ifac, const complex_t *wa, const int8_t isign)
+{
+ uint16_t i;
+ uint16_t k1, l1, l2;
+ uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
+
+ nf = ifac[1];
+ na = 0;
+ l1 = 1;
+ iw = 0;
+
+ for (k1 = 2; k1 <= nf+1; k1++)
+ {
+ ip = ifac[k1];
+ l2 = ip*l1;
+ ido = n / l2;
+ idl1 = ido*l1;
+
+ switch (ip)
+ {
case 4:
ix2 = iw + ido;
ix3 = ix2 + ido;
if (na == 0)
- passf4(ido, l1, c, ch, &wa[iw], &wa[ix2], &wa[ix3], isign);
+ passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
else
- passf4(ido, l1, ch, c, &wa[iw], &wa[ix2], &wa[ix3], isign);
+ passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
+
+ na = 1 - na;
+ break;
+ case 2:
+ if (na == 0)
+ passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
+ else
+ passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
+
+ na = 1 - na;
+ break;
+ case 3:
+ ix2 = iw + ido;
+
+ if (na == 0)
+ passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
+ else
+ passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
na = 1 - na;
break;
@@ -394,9 +978,87 @@ INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
ix4 = ix3 + ido;
if (na == 0)
- passf5(ido, l1, c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+ passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
else
- passf5(ido, l1, ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+ passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+
+ na = 1 - na;
+ break;
+ }
+
+ l1 = l2;
+ iw += (ip-1) * ido;
+ }
+
+ if (na == 0)
+ return;
+
+ for (i = 0; i < n; i++)
+ {
+ RE(c[i]) = RE(ch[i]);
+ IM(c[i]) = IM(ch[i]);
+ }
+}
+
+INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch,
+ const uint16_t *ifac, const complex_t *wa, const int8_t isign)
+{
+ uint16_t i;
+ uint16_t k1, l1, l2;
+ uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
+
+ nf = ifac[1];
+ na = 0;
+ l1 = 1;
+ iw = 0;
+
+ for (k1 = 2; k1 <= nf+1; k1++)
+ {
+ ip = ifac[k1];
+ l2 = ip*l1;
+ ido = n / l2;
+ idl1 = ido*l1;
+
+ switch (ip)
+ {
+ case 4:
+ ix2 = iw + ido;
+ ix3 = ix2 + ido;
+
+ if (na == 0)
+ passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
+ else
+ passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
+
+ na = 1 - na;
+ break;
+ case 2:
+ if (na == 0)
+ passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
+ else
+ passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
+
+ na = 1 - na;
+ break;
+ case 3:
+ ix2 = iw + ido;
+
+ if (na == 0)
+ passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
+ else
+ passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
+
+ na = 1 - na;
+ break;
+ case 5:
+ ix2 = iw + ido;
+ ix3 = ix2 + ido;
+ ix4 = ix3 + ido;
+
+ if (na == 0)
+ passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+ else
+ passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
na = 1 - na;
break;
@@ -418,14 +1080,21 @@ INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
void cfftf(cfft_info *cfft, complex_t *c)
{
- cfftf1(cfft->n, c, cfft->work, cfft->ifac, cfft->tab, -1);
+ cfftf1neg(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1);
}
void cfftb(cfft_info *cfft, complex_t *c)
{
- cfftf1(cfft->n, c, cfft->work, cfft->ifac, cfft->tab, +1);
+ cfftf1pos(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
}
+#ifdef USE_SSE
+void cfftb_sse(cfft_info *cfft, complex_t *c)
+{
+ cfftf1pos_sse(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
+}
+#endif
+
static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac)
{
static uint16_t ntryh[4] = {3, 4, 2, 5};
@@ -478,7 +1147,7 @@ startloop:
ifac[1] = nf;
#ifndef FIXED_POINT
- argh = 2.0*M_PI / (real_t)n;
+ argh = (real_t)2.0*(real_t)M_PI / (real_t)n;
i = 0;
l1 = 1;
@@ -504,8 +1173,8 @@ startloop:
i++;
fi++;
arg = fi * argld;
- RE(wa[i]) = cos(arg);
- IM(wa[i]) = sin(arg);
+ RE(wa[i]) = (real_t)cos(arg);
+ IM(wa[i]) = (real_t)sin(arg);
}
if (ip > 5)
@@ -521,13 +1190,13 @@ startloop:
cfft_info *cffti(uint16_t n)
{
- cfft_info *cfft = (cfft_info*)malloc(sizeof(cfft_info));
+ cfft_info *cfft = (cfft_info*)faad_malloc(sizeof(cfft_info));
cfft->n = n;
- cfft->work = (complex_t*)malloc(n*sizeof(complex_t));
+ cfft->work = (complex_t*)faad_malloc(n*sizeof(complex_t));
#ifndef FIXED_POINT
- cfft->tab = (complex_t*)malloc(n*sizeof(complex_t));
+ cfft->tab = (complex_t*)faad_malloc(n*sizeof(complex_t));
cffti1(n, cfft->tab, cfft->ifac);
#else
@@ -535,14 +1204,19 @@ cfft_info *cffti(uint16_t n)
switch (n)
{
- case 60: cfft->tab = cfft_tab_60; break;
case 64: cfft->tab = cfft_tab_64; break;
- case 480: cfft->tab = cfft_tab_480; break;
case 512: cfft->tab = cfft_tab_512; break;
#ifdef LD_DEC
- case 240: cfft->tab = cfft_tab_240; break;
case 256: cfft->tab = cfft_tab_256; break;
#endif
+
+#ifdef ALLOW_SMALL_FRAMELENGTH
+ case 60: cfft->tab = cfft_tab_60; break;
+ case 480: cfft->tab = cfft_tab_480; break;
+#ifdef LD_DEC
+ case 240: cfft->tab = cfft_tab_240; break;
+#endif
+#endif
}
#endif
@@ -551,11 +1225,11 @@ cfft_info *cffti(uint16_t n)
void cfftu(cfft_info *cfft)
{
- if (cfft->work) free(cfft->work);
+ if (cfft->work) faad_free(cfft->work);
#ifndef FIXED_POINT
- if (cfft->tab) free(cfft->tab);
+ if (cfft->tab) faad_free(cfft->tab);
#endif
- if (cfft) free(cfft);
+ if (cfft) faad_free(cfft);
}
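
For reference, the public entry points touched at the end of this diff (cffti, cfftf, cfftb, cfftu
and the SSE variant cfftb_sse) are driven roughly as in the sketch below. The wrapper function is
illustrative only; it assumes FAAD2's complex_t type and faad_malloc-backed allocation from
common.h, and that n is one of the supported transform lengths.

    #include "common.h"
    #include "cfft.h"

    /* Illustrative only: forward and backward complex FFT of length n,
       operating in place on x[0..n-1]. */
    static void cfft_roundtrip(complex_t *x, uint16_t n)
    {
        cfft_info *fft = cffti(n);  /* build twiddle tables and work buffer     */

        cfftf(fft, x);              /* forward transform (cfftf1neg, isign = -1) */
        cfftb(fft, x);              /* backward transform (cfftf1pos, isign = +1) */

        /* Like FFTPACK, the pair is unnormalized, so after the round trip
           every element of x is scaled by n. */

        cfftu(fft);                 /* release the work buffer and tables        */
    }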