diff options
Diffstat (limited to 'src/libfaad/cfft.c')
-rw-r--r-- | src/libfaad/cfft.c | 641 |
1 files changed, 406 insertions, 235 deletions
diff --git a/src/libfaad/cfft.c b/src/libfaad/cfft.c index bbdfc36df..fa858303c 100644 --- a/src/libfaad/cfft.c +++ b/src/libfaad/cfft.c @@ -1,6 +1,6 @@ /* ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding -** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com +** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by @@ -22,7 +22,7 @@ ** Commercial non-GPL licensing of this software is possible. ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. ** -** $Id: cfft.c,v 1.7 2003/12/30 02:00:10 miguelfreitas Exp $ +** $Id: cfft.c,v 1.8 2004/01/11 15:44:04 mroi Exp $ **/ /* @@ -43,84 +43,115 @@ #include "cfft_tab.h" +/* static function declarations */ +#ifdef USE_SSE +static void passf2pos_sse(const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa); +static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa); +static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, + const complex_t *wa1, const complex_t *wa2, const complex_t *wa3); +#endif +static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa); +static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa); +static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa1, const complex_t *wa2, const int8_t isign); +static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, + const complex_t *wa1, const complex_t *wa2, const complex_t *wa3); +static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, + const complex_t *wa1, const complex_t *wa2, const complex_t *wa3); +static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, + const complex_t *wa1, const complex_t *wa2, const complex_t *wa3, + const complex_t *wa4, const int8_t isign); +INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch, + const uint16_t *ifac, const complex_t *wa, const int8_t isign); +static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac); + + /*---------------------------------------------------------------------- passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd. ----------------------------------------------------------------------*/ -#ifdef USE_SSE -static void passf2pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc, +#if 0 //def USE_SSE +static void passf2pos_sse(const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa) { - uint16_t i, k, ah, ac; + uint16_t k, ah, ac; - if (ido == 1) + for (k = 0; k < l1; k++) { - for (k = 0; k < l1; k++) - { - ah = 2*k; - ac = 4*k; + ah = 2*k; + ac = 4*k; - RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]); - IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]); + RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]); + IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]); - RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]); - IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]); - } - } else { - for (k = 0; k < l1; k++) + RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]); + IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]); + } +} + +static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa) +{ + uint16_t i, k, ah, ac; + + for (k = 0; k < l1; k++) + { + ah = k*ido; + ac = 2*k*ido; + + for (i = 0; i < ido; i+=4) { - ah = k*ido; - ac = 2*k*ido; + __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14; + __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24; + __m128 w1, w2, w3, w4; - for (i = 0; i < ido; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14; - __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24; - __m128 w1, w2, w3, w4; + m1 = _mm_load_ps(&RE(cc[ac+i])); + m2 = _mm_load_ps(&RE(cc[ac+ido+i])); + m5 = _mm_load_ps(&RE(cc[ac+i+2])); + m6 = _mm_load_ps(&RE(cc[ac+ido+i+2])); + w1 = _mm_load_ps(&RE(wa[i])); + w3 = _mm_load_ps(&RE(wa[i+2])); - m1 = _mm_load_ps(&RE(cc[ac+i])); - m2 = _mm_load_ps(&RE(cc[ac+ido+i])); - m5 = _mm_load_ps(&RE(cc[ac+i+2])); - m6 = _mm_load_ps(&RE(cc[ac+ido+i+2])); - w1 = _mm_load_ps(&RE(wa[i])); - w3 = _mm_load_ps(&RE(wa[i+2])); + m3 = _mm_add_ps(m1, m2); + m15 = _mm_add_ps(m5, m6); - m3 = _mm_add_ps(m1, m2); - m15 = _mm_add_ps(m5, m6); + m4 = _mm_sub_ps(m1, m2); + m16 = _mm_sub_ps(m5, m6); - m4 = _mm_sub_ps(m1, m2); - m16 = _mm_sub_ps(m5, m6); + _mm_store_ps(&RE(ch[ah+i]), m3); + _mm_store_ps(&RE(ch[ah+i+2]), m15); - _mm_store_ps(&RE(ch[ah+i]), m3); - _mm_store_ps(&RE(ch[ah+i+2]), m15); - w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); - w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); + w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); + w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); - m7 = _mm_mul_ps(m4, w1); - m17 = _mm_mul_ps(m16, w3); - m8 = _mm_mul_ps(m4, w2); - m18 = _mm_mul_ps(m16, w4); + m7 = _mm_mul_ps(m4, w1); + m17 = _mm_mul_ps(m16, w3); + m8 = _mm_mul_ps(m4, w2); + m18 = _mm_mul_ps(m16, w4); - m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); - m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0)); - m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); - m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1)); + m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); + m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0)); + m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); + m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1)); - m11 = _mm_add_ps(m9, m10); - m21 = _mm_add_ps(m19, m20); - m12 = _mm_sub_ps(m9, m10); - m22 = _mm_sub_ps(m19, m20); + m11 = _mm_add_ps(m9, m10); + m21 = _mm_add_ps(m19, m20); + m12 = _mm_sub_ps(m9, m10); + m22 = _mm_sub_ps(m19, m20); - m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); - m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2)); + m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); + m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2)); - m14 = _mm_unpacklo_ps(m12, m13); - m24 = _mm_unpacklo_ps(m22, m23); + m14 = _mm_unpacklo_ps(m12, m13); + m24 = _mm_unpacklo_ps(m22, m23); - _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); - _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24); - } + _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); + _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24); } } } @@ -159,8 +190,13 @@ static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]); IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]); +#if 1 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]), IM(t2), RE(t2), RE(wa[i]), IM(wa[i])); +#else + ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]), + RE(t2), IM(t2), RE(wa[i]), IM(wa[i])); +#endif } } } @@ -199,8 +235,13 @@ static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]); IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]); +#if 1 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]), RE(t2), IM(t2), RE(wa[i]), IM(wa[i])); +#else + ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]), + IM(t2), RE(t2), RE(wa[i]), IM(wa[i])); +#endif } } } @@ -290,10 +331,17 @@ static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc, RE(d3) = RE(c2) + IM(c3); IM(d2) = IM(c2) + RE(c3); +#if 1 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]), IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]), IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i])); +#else + ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]), + RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]), + RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i])); +#endif } } } else { @@ -320,10 +368,17 @@ static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc, RE(d3) = RE(c2) - IM(c3); IM(d2) = IM(c2) - RE(c3); +#if 1 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]), RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]), RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i])); +#else + ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]), + IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]), + IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i])); +#endif } } } @@ -331,158 +386,213 @@ static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc, } #ifdef USE_SSE -static void passf4pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc, - complex_t *ch, const complex_t *wa1, const complex_t *wa2, - const complex_t *wa3) +ALIGN static const int32_t negate[4] = { 0x0, 0x0, 0x0, 0x80000000 }; + +__declspec(naked) static void passf4pos_sse(const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa1, const complex_t *wa2, + const complex_t *wa3) +{ + __asm { + push ebx + mov ebx, esp + and esp, -16 + push edi + push esi + sub esp, 8 + movzx edi, WORD PTR [ebx+8] + + movaps xmm1, XMMWORD PTR negate + + test edi, edi + jle l1_is_zero + + lea esi, DWORD PTR [edi+edi] + add esi, esi + sub esi, edi + add esi, esi + add esi, esi + add esi, esi + mov eax, DWORD PTR [ebx+16] + add esi, eax + lea ecx, DWORD PTR [edi+edi] + add ecx, ecx + add ecx, ecx + add ecx, ecx + add ecx, eax + lea edx, DWORD PTR [edi+edi] + add edx, edx + add edx, edx + add edx, eax + xor eax, eax + mov DWORD PTR [esp], ebp + mov ebp, DWORD PTR [ebx+12] + +fftloop: + lea edi, DWORD PTR [eax+eax] + add edi, edi + movaps xmm2, XMMWORD PTR [ebp+edi*8] + movaps xmm0, XMMWORD PTR [ebp+edi*8+16] + movaps xmm7, XMMWORD PTR [ebp+edi*8+32] + movaps xmm5, XMMWORD PTR [ebp+edi*8+48] + movaps xmm6, xmm2 + addps xmm6, xmm0 + movaps xmm4, xmm1 + xorps xmm4, xmm7 + movaps xmm3, xmm1 + xorps xmm3, xmm5 + xorps xmm2, xmm1 + xorps xmm0, xmm1 + addps xmm7, xmm5 + subps xmm2, xmm0 + movaps xmm0, xmm6 + shufps xmm0, xmm7, 68 + subps xmm4, xmm3 + shufps xmm6, xmm7, 238 + movaps xmm5, xmm2 + shufps xmm5, xmm4, 68 + movaps xmm3, xmm0 + addps xmm3, xmm6 + shufps xmm2, xmm4, 187 + subps xmm0, xmm6 + movaps xmm4, xmm5 + addps xmm4, xmm2 + mov edi, DWORD PTR [ebx+16] + movaps XMMWORD PTR [edi+eax*8], xmm3 + subps xmm5, xmm2 + movaps XMMWORD PTR [edx+eax*8], xmm4 + movaps XMMWORD PTR [ecx+eax*8], xmm0 + movaps XMMWORD PTR [esi+eax*8], xmm5 + add eax, 2 + movzx eax, ax + movzx edi, WORD PTR [ebx+8] + cmp eax, edi + jl fftloop + + mov ebp, DWORD PTR [esp] + +l1_is_zero: + + add esp, 8 + pop esi + pop edi + mov esp, ebx + pop ebx + ret + } +} +#endif + +#if 0 +static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, + complex_t *ch, const complex_t *wa1, const complex_t *wa2, + const complex_t *wa3) { uint16_t i, k, ac, ah; - if (ido == 1) + for (k = 0; k < l1; k++) { - for (k = 0; k < l1; k+=2) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10; - __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, n10; - __m128 neg1 = _mm_set_ps(-1.0, 1.0, 1.0, 1.0); + ac = 4*k*ido; + ah = k*ido; - m1 = _mm_load_ps(&RE(cc[4*k])); - m2 = _mm_load_ps(&RE(cc[4*k+2])); - n1 = _mm_load_ps(&RE(cc[4*k+4])); - n2 = _mm_load_ps(&RE(cc[4*k+6])); + for (i = 0; i < ido; i+=2) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16; + __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23; + __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30; + __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0); + m1 = _mm_load_ps(&RE(cc[ac+i])); + m2 = _mm_load_ps(&RE(cc[ac+i+2*ido])); m3 = _mm_add_ps(m1, m2); + m4 = _mm_sub_ps(m1, m2); - n4 = _mm_mul_ps(neg1, n1); - n5 = _mm_mul_ps(neg1, n2); - m4 = _mm_mul_ps(neg1, m1); - m5 = _mm_mul_ps(neg1, m2); - + n1 = _mm_load_ps(&RE(cc[ac+i+ido])); + n2 = _mm_load_ps(&RE(cc[ac+i+3*ido])); n3 = _mm_add_ps(n1, n2); - m6 = _mm_sub_ps(m4, m5); - m7 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(1, 0, 1, 0)); + n4 = _mm_mul_ps(neg1, n1); + n5 = _mm_mul_ps(neg1, n2); n6 = _mm_sub_ps(n4, n5); - m8 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(3, 2, 3, 2)); - n7 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(1, 0, 1, 0)); - m9 = _mm_add_ps(m7, m8); - n8 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(2, 3, 2, 3)); + m5 = _mm_add_ps(m3, n3); - m10 = _mm_sub_ps(m7, m8); - n9 = _mm_add_ps(n7, n8); + n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1)); + n8 = _mm_add_ps(m4, n7); - _mm_store_ps(&RE(ch[k]), m9); - n10 = _mm_sub_ps(n7, n8); - _mm_store_ps(&RE(ch[k+l1]), n9); - _mm_store_ps(&RE(ch[k+2*l1]), m10); - _mm_store_ps(&RE(ch[k+3*l1]), n10); - } - } else { - for (k = 0; k < l1; k++) - { - ac = 4*k*ido; - ah = k*ido; + m6 = _mm_sub_ps(m3, n3); + n9 = _mm_sub_ps(m4, n7); - for (i = 0; i < ido; i+=2) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16; - __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23; - __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30; - __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0); - - m1 = _mm_load_ps(&RE(cc[ac+i])); - m2 = _mm_load_ps(&RE(cc[ac+i+2*ido])); - m3 = _mm_add_ps(m1, m2); - m4 = _mm_sub_ps(m1, m2); - - n1 = _mm_load_ps(&RE(cc[ac+i+ido])); - n2 = _mm_load_ps(&RE(cc[ac+i+3*ido])); - n3 = _mm_add_ps(n1, n2); - - n4 = _mm_mul_ps(neg1, n1); - n5 = _mm_mul_ps(neg1, n2); - n6 = _mm_sub_ps(n4, n5); - - m5 = _mm_add_ps(m3, n3); - - n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1)); - n8 = _mm_add_ps(m4, n7); - - m6 = _mm_sub_ps(m3, n3); - n9 = _mm_sub_ps(m4, n7); - - _mm_store_ps(&RE(ch[ah+i]), m5); + _mm_store_ps(&RE(ch[ah+i]), m5); #if 0 - static INLINE void ComplexMult(real_t *y1, real_t *y2, - real_t x1, real_t x2, real_t c1, real_t c2) - { - *y1 = MUL_F(x1, c1) + MUL_F(x2, c2); - *y2 = MUL_F(x2, c1) - MUL_F(x1, c2); - } - - m7.0 = RE(c2)*RE(wa1[i]) - m7.1 = IM(c2)*IM(wa1[i]) - m7.2 = RE(c6)*RE(wa1[i+1]) - m7.3 = IM(c6)*IM(wa1[i+1]) - - m8.0 = RE(c2)*IM(wa1[i]) - m8.1 = IM(c2)*RE(wa1[i]) - m8.2 = RE(c6)*IM(wa1[i+1]) - m8.3 = IM(c6)*RE(wa1[i+1]) - - RE(0) = m7.0 - m7.1 - IM(0) = m8.0 + m8.1 - RE(1) = m7.2 - m7.3 - IM(1) = m8.2 + m8.3 - -//// - RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i]) - IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i]) - RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1]) - IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1]) + static INLINE void ComplexMult(real_t *y1, real_t *y2, + real_t x1, real_t x2, real_t c1, real_t c2) + { + *y1 = MUL_F(x1, c1) + MUL_F(x2, c2); + *y2 = MUL_F(x2, c1) - MUL_F(x1, c2); + } + + m7.0 = RE(c2)*RE(wa1[i]) + m7.1 = IM(c2)*IM(wa1[i]) + m7.2 = RE(c6)*RE(wa1[i+1]) + m7.3 = IM(c6)*IM(wa1[i+1]) + + m8.0 = RE(c2)*IM(wa1[i]) + m8.1 = IM(c2)*RE(wa1[i]) + m8.2 = RE(c6)*IM(wa1[i+1]) + m8.3 = IM(c6)*RE(wa1[i+1]) + + RE(0) = m7.0 - m7.1 + IM(0) = m8.0 + m8.1 + RE(1) = m7.2 - m7.3 + IM(1) = m8.2 + m8.3 + + //// + RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i]) + IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i]) + RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1]) + IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1]) #endif - w1 = _mm_load_ps(&RE(wa1[i])); - w3 = _mm_load_ps(&RE(wa2[i])); - w5 = _mm_load_ps(&RE(wa3[i])); - - w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); - w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); - w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1)); - - m7 = _mm_mul_ps(n8, w1); - m15 = _mm_mul_ps(m6, w3); - m23 = _mm_mul_ps(n9, w5); - m8 = _mm_mul_ps(n8, w2); - m16 = _mm_mul_ps(m6, w4); - m24 = _mm_mul_ps(n9, w6); - - m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); - m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0)); - m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0)); - m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); - m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1)); - m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1)); - - m11 = _mm_add_ps(m9, m10); - m19 = _mm_add_ps(m17, m18); - m27 = _mm_add_ps(m25, m26); - m12 = _mm_sub_ps(m9, m10); - m20 = _mm_sub_ps(m17, m18); - m28 = _mm_sub_ps(m25, m26); - - m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); - m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2)); - m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2)); - m14 = _mm_unpacklo_ps(m12, m13); - m22 = _mm_unpacklo_ps(m20, m21); - m30 = _mm_unpacklo_ps(m28, m29); - - _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); - _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22); - _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30); - } + w1 = _mm_load_ps(&RE(wa1[i])); + w3 = _mm_load_ps(&RE(wa2[i])); + w5 = _mm_load_ps(&RE(wa3[i])); + + w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); + w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); + w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1)); + + m7 = _mm_mul_ps(n8, w1); + m15 = _mm_mul_ps(m6, w3); + m23 = _mm_mul_ps(n9, w5); + m8 = _mm_mul_ps(n8, w2); + m16 = _mm_mul_ps(m6, w4); + m24 = _mm_mul_ps(n9, w6); + + m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); + m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0)); + m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0)); + m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); + m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1)); + m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1)); + + m11 = _mm_add_ps(m9, m10); + m19 = _mm_add_ps(m17, m18); + m27 = _mm_add_ps(m25, m26); + m12 = _mm_sub_ps(m9, m10); + m20 = _mm_sub_ps(m17, m18); + m28 = _mm_sub_ps(m25, m26); + + m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); + m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2)); + m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2)); + m14 = _mm_unpacklo_ps(m12, m13); + m22 = _mm_unpacklo_ps(m20, m21); + m30 = _mm_unpacklo_ps(m28, m29); + + _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); + _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22); + _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30); } } } @@ -555,12 +665,21 @@ static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc IM(ch[ah+i]) = IM(t2) + IM(t3); IM(c3) = IM(t2) - IM(t3); +#if 1 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]), IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]), IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i])); ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]), IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i])); +#else + ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]), + RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]), + RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i])); + ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]), + RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i])); +#endif } } } @@ -633,12 +752,21 @@ static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc IM(ch[ah+i]) = IM(t2) + IM(t3); IM(c3) = IM(t2) - IM(t3); +#if 1 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]), RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]), RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i])); ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]), RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i])); +#else + ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]), + IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]), + IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i])); + ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]), + IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i])); +#endif } } } @@ -774,6 +902,7 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, RE(d3) = RE(c3) - IM(c4); IM(d4) = IM(c3) - RE(c4); +#if 1 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]), IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]), @@ -782,6 +911,16 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i])); ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]), IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i])); +#else + ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]), + RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]), + RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i])); + ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]), + RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i])); + ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]), + RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i])); +#endif } } } else { @@ -823,6 +962,7 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, RE(d3) = RE(c3) + IM(c4); IM(d4) = IM(c3) + RE(c4); +#if 1 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]), RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i])); ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]), @@ -831,6 +971,16 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i])); ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]), RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i])); +#else + ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]), + IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i])); + ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]), + IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i])); + ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]), + IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i])); + ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]), + IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i])); +#endif } } } @@ -843,8 +993,12 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, ----------------------------------------------------------------------*/ #ifdef USE_SSE -INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch, - const uint16_t *ifac, const complex_t *wa, const int8_t isign) + +#define CONV(A,B,C) ( (A<<2) | ((B & 0x1)<<1) | ((C==1)&0x1) ) + +static INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch, + const uint16_t *ifac, const complex_t *wa, + const int8_t isign) { uint16_t i; uint16_t k1, l1, l2; @@ -862,51 +1016,62 @@ INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch, ido = n / l2; idl1 = ido*l1; - switch (ip) - { - case 4: - ix2 = iw + ido; - ix3 = ix2 + ido; - - if (na == 0) - passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); - else - passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); + ix2 = iw + ido; + ix3 = ix2 + ido; + ix4 = ix3 + ido; - na = 1 - na; + switch (CONV(ip,na,ido)) + { + case CONV(4,0,0): + //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); + passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); break; - case 2: - if (na == 0) - passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); - else - passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); - - na = 1 - na; + case CONV(4,0,1): + passf4pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); break; - case 3: - ix2 = iw + ido; - - if (na == 0) - passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign); - else - passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign); - - na = 1 - na; + case CONV(4,1,0): + passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); + //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); break; - case 5: - ix2 = iw + ido; - ix3 = ix2 + ido; - ix4 = ix3 + ido; - - if (na == 0) - passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); - else - passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); - - na = 1 - na; + case CONV(4,1,1): + passf4pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); + break; + case CONV(2,0,0): + passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); + //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); + break; + case CONV(2,0,1): + passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); + //passf2pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); + break; + case CONV(2,1,0): + passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); + //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); + break; + case CONV(2,1,1): + passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); + //passf2pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); + break; + case CONV(3,0,0): + case CONV(3,0,1): + passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign); + break; + case CONV(3,1,0): + case CONV(3,1,1): + passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign); + break; + case CONV(5,0,0): + case CONV(5,0,1): + passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + break; + case CONV(5,1,0): + case CONV(5,1,1): + passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); break; } + na = 1 - na; + l1 = l2; iw += (ip-1) * ido; } @@ -922,8 +1087,9 @@ INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch, } #endif -INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch, - const uint16_t *ifac, const complex_t *wa, const int8_t isign) +static INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch, + const uint16_t *ifac, const complex_t *wa, + const int8_t isign) { uint16_t i; uint16_t k1, l1, l2; @@ -1000,8 +1166,9 @@ INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch, } } -INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch, - const uint16_t *ifac, const complex_t *wa, const int8_t isign) +static INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch, + const uint16_t *ifac, const complex_t *wa, + const int8_t isign) { uint16_t i; uint16_t k1, l1, l2; @@ -1104,7 +1271,7 @@ static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac) uint16_t i1, k1, l1, l2; uint16_t ld, ii, ip; #endif - uint16_t ntry, i, j; + uint16_t ntry = 0, i, j; uint16_t ib; uint16_t nf, nl, nq, nr; @@ -1174,7 +1341,11 @@ startloop: fi++; arg = fi * argld; RE(wa[i]) = (real_t)cos(arg); +#if 1 IM(wa[i]) = (real_t)sin(arg); +#else + IM(wa[i]) = (real_t)-sin(arg); +#endif } if (ip > 5) |