summaryrefslogtreecommitdiff
path: root/src/libfaad/cfft.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libfaad/cfft.c')
-rw-r--r--src/libfaad/cfft.c641
1 files changed, 406 insertions, 235 deletions
diff --git a/src/libfaad/cfft.c b/src/libfaad/cfft.c
index bbdfc36df..fa858303c 100644
--- a/src/libfaad/cfft.c
+++ b/src/libfaad/cfft.c
@@ -1,6 +1,6 @@
/*
** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
-** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com
+** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
** Commercial non-GPL licensing of this software is possible.
** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
**
-** $Id: cfft.c,v 1.7 2003/12/30 02:00:10 miguelfreitas Exp $
+** $Id: cfft.c,v 1.8 2004/01/11 15:44:04 mroi Exp $
**/
/*
@@ -43,84 +43,115 @@
#include "cfft_tab.h"
+/* static function declarations */
+#ifdef USE_SSE
+static void passf2pos_sse(const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa);
+static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa);
+static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
+ const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
+#endif
+static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa);
+static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa);
+static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa1, const complex_t *wa2, const int8_t isign);
+static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
+ const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
+static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
+ const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
+static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
+ const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
+ const complex_t *wa4, const int8_t isign);
+INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
+ const uint16_t *ifac, const complex_t *wa, const int8_t isign);
+static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac);
+
+
/*----------------------------------------------------------------------
passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd.
----------------------------------------------------------------------*/
-#ifdef USE_SSE
-static void passf2pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+#if 0 //def USE_SSE
+static void passf2pos_sse(const uint16_t l1, const complex_t *cc,
complex_t *ch, const complex_t *wa)
{
- uint16_t i, k, ah, ac;
+ uint16_t k, ah, ac;
- if (ido == 1)
+ for (k = 0; k < l1; k++)
{
- for (k = 0; k < l1; k++)
- {
- ah = 2*k;
- ac = 4*k;
+ ah = 2*k;
+ ac = 4*k;
- RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
- IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
+ RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
+ IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
- RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
- IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
- }
- } else {
- for (k = 0; k < l1; k++)
+ RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
+ IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
+ }
+}
+
+static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa)
+{
+ uint16_t i, k, ah, ac;
+
+ for (k = 0; k < l1; k++)
+ {
+ ah = k*ido;
+ ac = 2*k*ido;
+
+ for (i = 0; i < ido; i+=4)
{
- ah = k*ido;
- ac = 2*k*ido;
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
+ __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
+ __m128 w1, w2, w3, w4;
- for (i = 0; i < ido; i+=4)
- {
- __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
- __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
- __m128 w1, w2, w3, w4;
+ m1 = _mm_load_ps(&RE(cc[ac+i]));
+ m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
+ m5 = _mm_load_ps(&RE(cc[ac+i+2]));
+ m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
+ w1 = _mm_load_ps(&RE(wa[i]));
+ w3 = _mm_load_ps(&RE(wa[i+2]));
- m1 = _mm_load_ps(&RE(cc[ac+i]));
- m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
- m5 = _mm_load_ps(&RE(cc[ac+i+2]));
- m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
- w1 = _mm_load_ps(&RE(wa[i]));
- w3 = _mm_load_ps(&RE(wa[i+2]));
+ m3 = _mm_add_ps(m1, m2);
+ m15 = _mm_add_ps(m5, m6);
- m3 = _mm_add_ps(m1, m2);
- m15 = _mm_add_ps(m5, m6);
+ m4 = _mm_sub_ps(m1, m2);
+ m16 = _mm_sub_ps(m5, m6);
- m4 = _mm_sub_ps(m1, m2);
- m16 = _mm_sub_ps(m5, m6);
+ _mm_store_ps(&RE(ch[ah+i]), m3);
+ _mm_store_ps(&RE(ch[ah+i+2]), m15);
- _mm_store_ps(&RE(ch[ah+i]), m3);
- _mm_store_ps(&RE(ch[ah+i+2]), m15);
- w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
- w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
+ w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
+ w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
- m7 = _mm_mul_ps(m4, w1);
- m17 = _mm_mul_ps(m16, w3);
- m8 = _mm_mul_ps(m4, w2);
- m18 = _mm_mul_ps(m16, w4);
+ m7 = _mm_mul_ps(m4, w1);
+ m17 = _mm_mul_ps(m16, w3);
+ m8 = _mm_mul_ps(m4, w2);
+ m18 = _mm_mul_ps(m16, w4);
- m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
- m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
- m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
- m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));
+ m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
+ m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
+ m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
+ m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));
- m11 = _mm_add_ps(m9, m10);
- m21 = _mm_add_ps(m19, m20);
- m12 = _mm_sub_ps(m9, m10);
- m22 = _mm_sub_ps(m19, m20);
+ m11 = _mm_add_ps(m9, m10);
+ m21 = _mm_add_ps(m19, m20);
+ m12 = _mm_sub_ps(m9, m10);
+ m22 = _mm_sub_ps(m19, m20);
- m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
- m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));
+ m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
+ m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));
- m14 = _mm_unpacklo_ps(m12, m13);
- m24 = _mm_unpacklo_ps(m22, m23);
+ m14 = _mm_unpacklo_ps(m12, m13);
+ m24 = _mm_unpacklo_ps(m22, m23);
- _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
- _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
- }
+ _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
+ _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
}
}
}
@@ -159,8 +190,13 @@ static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc
IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
+#if 1
ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
+#else
+ ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
+ RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
+#endif
}
}
}
@@ -199,8 +235,13 @@ static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc
IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
+#if 1
ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
+#else
+ ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
+ IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
+#endif
}
}
}
@@ -290,10 +331,17 @@ static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
RE(d3) = RE(c2) + IM(c3);
IM(d2) = IM(c2) + RE(c3);
+#if 1
ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
+#else
+ ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
+ RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
+ RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
+#endif
}
}
} else {
@@ -320,10 +368,17 @@ static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
RE(d3) = RE(c2) - IM(c3);
IM(d2) = IM(c2) - RE(c3);
+#if 1
ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
+#else
+ ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
+ IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
+ IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
+#endif
}
}
}
@@ -331,158 +386,213 @@ static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
}
#ifdef USE_SSE
-static void passf4pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,
- complex_t *ch, const complex_t *wa1, const complex_t *wa2,
- const complex_t *wa3)
+ALIGN static const int32_t negate[4] = { 0x0, 0x0, 0x0, 0x80000000 };
+
+__declspec(naked) static void passf4pos_sse(const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+ const complex_t *wa3)
+{
+ __asm {
+ push ebx
+ mov ebx, esp
+ and esp, -16
+ push edi
+ push esi
+ sub esp, 8
+ movzx edi, WORD PTR [ebx+8]
+
+ movaps xmm1, XMMWORD PTR negate
+
+ test edi, edi
+ jle l1_is_zero
+
+ lea esi, DWORD PTR [edi+edi]
+ add esi, esi
+ sub esi, edi
+ add esi, esi
+ add esi, esi
+ add esi, esi
+ mov eax, DWORD PTR [ebx+16]
+ add esi, eax
+ lea ecx, DWORD PTR [edi+edi]
+ add ecx, ecx
+ add ecx, ecx
+ add ecx, ecx
+ add ecx, eax
+ lea edx, DWORD PTR [edi+edi]
+ add edx, edx
+ add edx, edx
+ add edx, eax
+ xor eax, eax
+ mov DWORD PTR [esp], ebp
+ mov ebp, DWORD PTR [ebx+12]
+
+fftloop:
+ lea edi, DWORD PTR [eax+eax]
+ add edi, edi
+ movaps xmm2, XMMWORD PTR [ebp+edi*8]
+ movaps xmm0, XMMWORD PTR [ebp+edi*8+16]
+ movaps xmm7, XMMWORD PTR [ebp+edi*8+32]
+ movaps xmm5, XMMWORD PTR [ebp+edi*8+48]
+ movaps xmm6, xmm2
+ addps xmm6, xmm0
+ movaps xmm4, xmm1
+ xorps xmm4, xmm7
+ movaps xmm3, xmm1
+ xorps xmm3, xmm5
+ xorps xmm2, xmm1
+ xorps xmm0, xmm1
+ addps xmm7, xmm5
+ subps xmm2, xmm0
+ movaps xmm0, xmm6
+ shufps xmm0, xmm7, 68
+ subps xmm4, xmm3
+ shufps xmm6, xmm7, 238
+ movaps xmm5, xmm2
+ shufps xmm5, xmm4, 68
+ movaps xmm3, xmm0
+ addps xmm3, xmm6
+ shufps xmm2, xmm4, 187
+ subps xmm0, xmm6
+ movaps xmm4, xmm5
+ addps xmm4, xmm2
+ mov edi, DWORD PTR [ebx+16]
+ movaps XMMWORD PTR [edi+eax*8], xmm3
+ subps xmm5, xmm2
+ movaps XMMWORD PTR [edx+eax*8], xmm4
+ movaps XMMWORD PTR [ecx+eax*8], xmm0
+ movaps XMMWORD PTR [esi+eax*8], xmm5
+ add eax, 2
+ movzx eax, ax
+ movzx edi, WORD PTR [ebx+8]
+ cmp eax, edi
+ jl fftloop
+
+ mov ebp, DWORD PTR [esp]
+
+l1_is_zero:
+
+ add esp, 8
+ pop esi
+ pop edi
+ mov esp, ebx
+ pop ebx
+ ret
+ }
+}
+#endif
+
+#if 0
+static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+ complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+ const complex_t *wa3)
{
uint16_t i, k, ac, ah;
- if (ido == 1)
+ for (k = 0; k < l1; k++)
{
- for (k = 0; k < l1; k+=2)
- {
- __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10;
- __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, n10;
- __m128 neg1 = _mm_set_ps(-1.0, 1.0, 1.0, 1.0);
+ ac = 4*k*ido;
+ ah = k*ido;
- m1 = _mm_load_ps(&RE(cc[4*k]));
- m2 = _mm_load_ps(&RE(cc[4*k+2]));
- n1 = _mm_load_ps(&RE(cc[4*k+4]));
- n2 = _mm_load_ps(&RE(cc[4*k+6]));
+ for (i = 0; i < ido; i+=2)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16;
+ __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23;
+ __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30;
+ __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
+ m1 = _mm_load_ps(&RE(cc[ac+i]));
+ m2 = _mm_load_ps(&RE(cc[ac+i+2*ido]));
m3 = _mm_add_ps(m1, m2);
+ m4 = _mm_sub_ps(m1, m2);
- n4 = _mm_mul_ps(neg1, n1);
- n5 = _mm_mul_ps(neg1, n2);
- m4 = _mm_mul_ps(neg1, m1);
- m5 = _mm_mul_ps(neg1, m2);
-
+ n1 = _mm_load_ps(&RE(cc[ac+i+ido]));
+ n2 = _mm_load_ps(&RE(cc[ac+i+3*ido]));
n3 = _mm_add_ps(n1, n2);
- m6 = _mm_sub_ps(m4, m5);
- m7 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(1, 0, 1, 0));
+ n4 = _mm_mul_ps(neg1, n1);
+ n5 = _mm_mul_ps(neg1, n2);
n6 = _mm_sub_ps(n4, n5);
- m8 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(3, 2, 3, 2));
- n7 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(1, 0, 1, 0));
- m9 = _mm_add_ps(m7, m8);
- n8 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(2, 3, 2, 3));
+ m5 = _mm_add_ps(m3, n3);
- m10 = _mm_sub_ps(m7, m8);
- n9 = _mm_add_ps(n7, n8);
+ n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1));
+ n8 = _mm_add_ps(m4, n7);
- _mm_store_ps(&RE(ch[k]), m9);
- n10 = _mm_sub_ps(n7, n8);
- _mm_store_ps(&RE(ch[k+l1]), n9);
- _mm_store_ps(&RE(ch[k+2*l1]), m10);
- _mm_store_ps(&RE(ch[k+3*l1]), n10);
- }
- } else {
- for (k = 0; k < l1; k++)
- {
- ac = 4*k*ido;
- ah = k*ido;
+ m6 = _mm_sub_ps(m3, n3);
+ n9 = _mm_sub_ps(m4, n7);
- for (i = 0; i < ido; i+=2)
- {
- __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16;
- __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23;
- __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30;
- __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
-
- m1 = _mm_load_ps(&RE(cc[ac+i]));
- m2 = _mm_load_ps(&RE(cc[ac+i+2*ido]));
- m3 = _mm_add_ps(m1, m2);
- m4 = _mm_sub_ps(m1, m2);
-
- n1 = _mm_load_ps(&RE(cc[ac+i+ido]));
- n2 = _mm_load_ps(&RE(cc[ac+i+3*ido]));
- n3 = _mm_add_ps(n1, n2);
-
- n4 = _mm_mul_ps(neg1, n1);
- n5 = _mm_mul_ps(neg1, n2);
- n6 = _mm_sub_ps(n4, n5);
-
- m5 = _mm_add_ps(m3, n3);
-
- n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1));
- n8 = _mm_add_ps(m4, n7);
-
- m6 = _mm_sub_ps(m3, n3);
- n9 = _mm_sub_ps(m4, n7);
-
- _mm_store_ps(&RE(ch[ah+i]), m5);
+ _mm_store_ps(&RE(ch[ah+i]), m5);
#if 0
- static INLINE void ComplexMult(real_t *y1, real_t *y2,
- real_t x1, real_t x2, real_t c1, real_t c2)
- {
- *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);
- *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);
- }
-
- m7.0 = RE(c2)*RE(wa1[i])
- m7.1 = IM(c2)*IM(wa1[i])
- m7.2 = RE(c6)*RE(wa1[i+1])
- m7.3 = IM(c6)*IM(wa1[i+1])
-
- m8.0 = RE(c2)*IM(wa1[i])
- m8.1 = IM(c2)*RE(wa1[i])
- m8.2 = RE(c6)*IM(wa1[i+1])
- m8.3 = IM(c6)*RE(wa1[i+1])
-
- RE(0) = m7.0 - m7.1
- IM(0) = m8.0 + m8.1
- RE(1) = m7.2 - m7.3
- IM(1) = m8.2 + m8.3
-
-////
- RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i])
- IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i])
- RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1])
- IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])
+ static INLINE void ComplexMult(real_t *y1, real_t *y2,
+ real_t x1, real_t x2, real_t c1, real_t c2)
+ {
+ *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);
+ *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);
+ }
+
+ m7.0 = RE(c2)*RE(wa1[i])
+ m7.1 = IM(c2)*IM(wa1[i])
+ m7.2 = RE(c6)*RE(wa1[i+1])
+ m7.3 = IM(c6)*IM(wa1[i+1])
+
+ m8.0 = RE(c2)*IM(wa1[i])
+ m8.1 = IM(c2)*RE(wa1[i])
+ m8.2 = RE(c6)*IM(wa1[i+1])
+ m8.3 = IM(c6)*RE(wa1[i+1])
+
+ RE(0) = m7.0 - m7.1
+ IM(0) = m8.0 + m8.1
+ RE(1) = m7.2 - m7.3
+ IM(1) = m8.2 + m8.3
+
+ ////
+ RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i])
+ IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i])
+ RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1])
+ IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])
#endif
- w1 = _mm_load_ps(&RE(wa1[i]));
- w3 = _mm_load_ps(&RE(wa2[i]));
- w5 = _mm_load_ps(&RE(wa3[i]));
-
- w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
- w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
- w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1));
-
- m7 = _mm_mul_ps(n8, w1);
- m15 = _mm_mul_ps(m6, w3);
- m23 = _mm_mul_ps(n9, w5);
- m8 = _mm_mul_ps(n8, w2);
- m16 = _mm_mul_ps(m6, w4);
- m24 = _mm_mul_ps(n9, w6);
-
- m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
- m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0));
- m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0));
- m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
- m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1));
- m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1));
-
- m11 = _mm_add_ps(m9, m10);
- m19 = _mm_add_ps(m17, m18);
- m27 = _mm_add_ps(m25, m26);
- m12 = _mm_sub_ps(m9, m10);
- m20 = _mm_sub_ps(m17, m18);
- m28 = _mm_sub_ps(m25, m26);
-
- m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
- m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2));
- m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2));
- m14 = _mm_unpacklo_ps(m12, m13);
- m22 = _mm_unpacklo_ps(m20, m21);
- m30 = _mm_unpacklo_ps(m28, m29);
-
- _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
- _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22);
- _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30);
- }
+ w1 = _mm_load_ps(&RE(wa1[i]));
+ w3 = _mm_load_ps(&RE(wa2[i]));
+ w5 = _mm_load_ps(&RE(wa3[i]));
+
+ w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
+ w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
+ w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1));
+
+ m7 = _mm_mul_ps(n8, w1);
+ m15 = _mm_mul_ps(m6, w3);
+ m23 = _mm_mul_ps(n9, w5);
+ m8 = _mm_mul_ps(n8, w2);
+ m16 = _mm_mul_ps(m6, w4);
+ m24 = _mm_mul_ps(n9, w6);
+
+ m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
+ m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0));
+ m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0));
+ m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
+ m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1));
+ m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1));
+
+ m11 = _mm_add_ps(m9, m10);
+ m19 = _mm_add_ps(m17, m18);
+ m27 = _mm_add_ps(m25, m26);
+ m12 = _mm_sub_ps(m9, m10);
+ m20 = _mm_sub_ps(m17, m18);
+ m28 = _mm_sub_ps(m25, m26);
+
+ m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
+ m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2));
+ m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2));
+ m14 = _mm_unpacklo_ps(m12, m13);
+ m22 = _mm_unpacklo_ps(m20, m21);
+ m30 = _mm_unpacklo_ps(m28, m29);
+
+ _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
+ _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22);
+ _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30);
}
}
}
@@ -555,12 +665,21 @@ static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc
IM(ch[ah+i]) = IM(t2) + IM(t3);
IM(c3) = IM(t2) - IM(t3);
+#if 1
ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
+#else
+ ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
+ RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
+ RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
+ ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
+ RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
+#endif
}
}
}
@@ -633,12 +752,21 @@ static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc
IM(ch[ah+i]) = IM(t2) + IM(t3);
IM(c3) = IM(t2) - IM(t3);
+#if 1
ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
+#else
+ ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
+ IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
+ IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
+ ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
+ IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
+#endif
}
}
}
@@ -774,6 +902,7 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
RE(d3) = RE(c3) - IM(c4);
IM(d4) = IM(c3) - RE(c4);
+#if 1
ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
@@ -782,6 +911,16 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
+#else
+ ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
+ RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
+ RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
+ ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]),
+ RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
+ ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
+ RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
+#endif
}
}
} else {
@@ -823,6 +962,7 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
RE(d3) = RE(c3) + IM(c4);
IM(d4) = IM(c3) + RE(c4);
+#if 1
ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
@@ -831,6 +971,16 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
+#else
+ ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
+ IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
+ ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
+ IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
+ ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]),
+ IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
+ ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
+ IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
+#endif
}
}
}
@@ -843,8 +993,12 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
----------------------------------------------------------------------*/
#ifdef USE_SSE
-INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
- const uint16_t *ifac, const complex_t *wa, const int8_t isign)
+
+#define CONV(A,B,C) ( (A<<2) | ((B & 0x1)<<1) | ((C==1)&0x1) )
+
+static INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
+ const uint16_t *ifac, const complex_t *wa,
+ const int8_t isign)
{
uint16_t i;
uint16_t k1, l1, l2;
@@ -862,51 +1016,62 @@ INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
ido = n / l2;
idl1 = ido*l1;
- switch (ip)
- {
- case 4:
- ix2 = iw + ido;
- ix3 = ix2 + ido;
-
- if (na == 0)
- passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
- else
- passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
+ ix2 = iw + ido;
+ ix3 = ix2 + ido;
+ ix4 = ix3 + ido;
- na = 1 - na;
+ switch (CONV(ip,na,ido))
+ {
+ case CONV(4,0,0):
+ //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
+ passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
break;
- case 2:
- if (na == 0)
- passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
- else
- passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
-
- na = 1 - na;
+ case CONV(4,0,1):
+ passf4pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
break;
- case 3:
- ix2 = iw + ido;
-
- if (na == 0)
- passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
- else
- passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
-
- na = 1 - na;
+ case CONV(4,1,0):
+ passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
+ //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
break;
- case 5:
- ix2 = iw + ido;
- ix3 = ix2 + ido;
- ix4 = ix3 + ido;
-
- if (na == 0)
- passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
- else
- passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
-
- na = 1 - na;
+ case CONV(4,1,1):
+ passf4pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
+ break;
+ case CONV(2,0,0):
+ passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
+ //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
+ break;
+ case CONV(2,0,1):
+ passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
+ //passf2pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
+ break;
+ case CONV(2,1,0):
+ passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
+ //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
+ break;
+ case CONV(2,1,1):
+ passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
+ //passf2pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
+ break;
+ case CONV(3,0,0):
+ case CONV(3,0,1):
+ passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
+ break;
+ case CONV(3,1,0):
+ case CONV(3,1,1):
+ passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
+ break;
+ case CONV(5,0,0):
+ case CONV(5,0,1):
+ passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+ break;
+ case CONV(5,1,0):
+ case CONV(5,1,1):
+ passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
break;
}
+ na = 1 - na;
+
l1 = l2;
iw += (ip-1) * ido;
}
@@ -922,8 +1087,9 @@ INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
}
#endif
-INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,
- const uint16_t *ifac, const complex_t *wa, const int8_t isign)
+static INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,
+ const uint16_t *ifac, const complex_t *wa,
+ const int8_t isign)
{
uint16_t i;
uint16_t k1, l1, l2;
@@ -1000,8 +1166,9 @@ INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,
}
}
-INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch,
- const uint16_t *ifac, const complex_t *wa, const int8_t isign)
+static INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch,
+ const uint16_t *ifac, const complex_t *wa,
+ const int8_t isign)
{
uint16_t i;
uint16_t k1, l1, l2;
@@ -1104,7 +1271,7 @@ static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac)
uint16_t i1, k1, l1, l2;
uint16_t ld, ii, ip;
#endif
- uint16_t ntry, i, j;
+ uint16_t ntry = 0, i, j;
uint16_t ib;
uint16_t nf, nl, nq, nr;
@@ -1174,7 +1341,11 @@ startloop:
fi++;
arg = fi * argld;
RE(wa[i]) = (real_t)cos(arg);
+#if 1
IM(wa[i]) = (real_t)sin(arg);
+#else
+ IM(wa[i]) = (real_t)-sin(arg);
+#endif
}
if (ip > 5)