diff options
Diffstat (limited to 'src/libfaad/cfft.c')
-rw-r--r-- | src/libfaad/cfft.c | 420 |
1 files changed, 8 insertions, 412 deletions
diff --git a/src/libfaad/cfft.c b/src/libfaad/cfft.c index 73811e62d..ad3ebd1a5 100644 --- a/src/libfaad/cfft.c +++ b/src/libfaad/cfft.c @@ -22,7 +22,7 @@ ** Commercial non-GPL licensing of this software is possible. ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. ** -** $Id: cfft.c,v 1.10 2004/12/03 01:15:29 tmattern Exp $ +** $Id: cfft.c,v 1.11 2005/10/29 23:57:06 tmmm Exp $ **/ /* @@ -44,14 +44,6 @@ /* static function declarations */ -#ifdef USE_SSE -static void passf2pos_sse(const uint16_t l1, const complex_t *cc, - complex_t *ch, const complex_t *wa); -static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, - complex_t *ch, const complex_t *wa); -static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, - const complex_t *wa1, const complex_t *wa2, const complex_t *wa3); -#endif static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa); static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, @@ -74,89 +66,6 @@ static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac); passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd. ----------------------------------------------------------------------*/ -#if 0 //def USE_SSE -static void passf2pos_sse(const uint16_t l1, const complex_t *cc, - complex_t *ch, const complex_t *wa) -{ - uint16_t k, ah, ac; - - for (k = 0; k < l1; k++) - { - ah = 2*k; - ac = 4*k; - - RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]); - IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]); - - RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]); - IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]); - } -} - -static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, - complex_t *ch, const complex_t *wa) -{ - uint16_t i, k, ah, ac; - - for (k = 0; k < l1; k++) - { - ah = k*ido; - ac = 2*k*ido; - - for (i = 0; i < ido; i+=4) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14; - __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24; - __m128 w1, w2, w3, w4; - - m1 = _mm_load_ps(&RE(cc[ac+i])); - m2 = _mm_load_ps(&RE(cc[ac+ido+i])); - m5 = _mm_load_ps(&RE(cc[ac+i+2])); - m6 = _mm_load_ps(&RE(cc[ac+ido+i+2])); - w1 = _mm_load_ps(&RE(wa[i])); - w3 = _mm_load_ps(&RE(wa[i+2])); - - m3 = _mm_add_ps(m1, m2); - m15 = _mm_add_ps(m5, m6); - - m4 = _mm_sub_ps(m1, m2); - m16 = _mm_sub_ps(m5, m6); - - _mm_store_ps(&RE(ch[ah+i]), m3); - _mm_store_ps(&RE(ch[ah+i+2]), m15); - - - w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); - w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); - - m7 = _mm_mul_ps(m4, w1); - m17 = _mm_mul_ps(m16, w3); - m8 = _mm_mul_ps(m4, w2); - m18 = _mm_mul_ps(m16, w4); - - m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); - m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0)); - m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); - m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1)); - - m11 = _mm_add_ps(m9, m10); - m21 = _mm_add_ps(m19, m20); - m12 = _mm_sub_ps(m9, m10); - m22 = _mm_sub_ps(m19, m20); - - m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); - m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2)); - - m14 = _mm_unpacklo_ps(m12, m13); - m24 = _mm_unpacklo_ps(m22, m23); - - _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); - _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24); - } - } -} -#endif - static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa) { @@ -385,218 +294,6 @@ static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc, } } -#ifdef USE_SSE -ALIGN static const int32_t negate[4] = { 0x0, 0x0, 0x0, 0x80000000 }; - -__declspec(naked) static void passf4pos_sse(const uint16_t l1, const complex_t *cc, - complex_t *ch, const complex_t *wa1, const complex_t *wa2, - const complex_t *wa3) -{ - __asm { - push ebx - mov ebx, esp - and esp, -16 - push edi - push esi - sub esp, 8 - movzx edi, WORD PTR [ebx+8] - - movaps xmm1, XMMWORD PTR negate - - test edi, edi - jle l1_is_zero - - lea esi, DWORD PTR [edi+edi] - add esi, esi - sub esi, edi - add esi, esi - add esi, esi - add esi, esi - mov eax, DWORD PTR [ebx+16] - add esi, eax - lea ecx, DWORD PTR [edi+edi] - add ecx, ecx - add ecx, ecx - add ecx, ecx - add ecx, eax - lea edx, DWORD PTR [edi+edi] - add edx, edx - add edx, edx - add edx, eax - xor eax, eax - mov DWORD PTR [esp], ebp - mov ebp, DWORD PTR [ebx+12] - -fftloop: - lea edi, DWORD PTR [eax+eax] - add edi, edi - movaps xmm2, XMMWORD PTR [ebp+edi*8] - movaps xmm0, XMMWORD PTR [ebp+edi*8+16] - movaps xmm7, XMMWORD PTR [ebp+edi*8+32] - movaps xmm5, XMMWORD PTR [ebp+edi*8+48] - movaps xmm6, xmm2 - addps xmm6, xmm0 - movaps xmm4, xmm1 - xorps xmm4, xmm7 - movaps xmm3, xmm1 - xorps xmm3, xmm5 - xorps xmm2, xmm1 - xorps xmm0, xmm1 - addps xmm7, xmm5 - subps xmm2, xmm0 - movaps xmm0, xmm6 - shufps xmm0, xmm7, 68 - subps xmm4, xmm3 - shufps xmm6, xmm7, 238 - movaps xmm5, xmm2 - shufps xmm5, xmm4, 68 - movaps xmm3, xmm0 - addps xmm3, xmm6 - shufps xmm2, xmm4, 187 - subps xmm0, xmm6 - movaps xmm4, xmm5 - addps xmm4, xmm2 - mov edi, DWORD PTR [ebx+16] - movaps XMMWORD PTR [edi+eax*8], xmm3 - subps xmm5, xmm2 - movaps XMMWORD PTR [edx+eax*8], xmm4 - movaps XMMWORD PTR [ecx+eax*8], xmm0 - movaps XMMWORD PTR [esi+eax*8], xmm5 - add eax, 2 - movzx eax, ax - movzx edi, WORD PTR [ebx+8] - cmp eax, edi - jl fftloop - - mov ebp, DWORD PTR [esp] - -l1_is_zero: - - add esp, 8 - pop esi - pop edi - mov esp, ebx - pop ebx - ret - } -} -#endif - -#if 0 -static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, - complex_t *ch, const complex_t *wa1, const complex_t *wa2, - const complex_t *wa3) -{ - uint16_t i, k, ac, ah; - - for (k = 0; k < l1; k++) - { - ac = 4*k*ido; - ah = k*ido; - - for (i = 0; i < ido; i+=2) - { - __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16; - __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23; - __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30; - __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0); - - m1 = _mm_load_ps(&RE(cc[ac+i])); - m2 = _mm_load_ps(&RE(cc[ac+i+2*ido])); - m3 = _mm_add_ps(m1, m2); - m4 = _mm_sub_ps(m1, m2); - - n1 = _mm_load_ps(&RE(cc[ac+i+ido])); - n2 = _mm_load_ps(&RE(cc[ac+i+3*ido])); - n3 = _mm_add_ps(n1, n2); - - n4 = _mm_mul_ps(neg1, n1); - n5 = _mm_mul_ps(neg1, n2); - n6 = _mm_sub_ps(n4, n5); - - m5 = _mm_add_ps(m3, n3); - - n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1)); - n8 = _mm_add_ps(m4, n7); - - m6 = _mm_sub_ps(m3, n3); - n9 = _mm_sub_ps(m4, n7); - - _mm_store_ps(&RE(ch[ah+i]), m5); - -#if 0 - static INLINE void ComplexMult(real_t *y1, real_t *y2, - real_t x1, real_t x2, real_t c1, real_t c2) - { - *y1 = MUL_F(x1, c1) + MUL_F(x2, c2); - *y2 = MUL_F(x2, c1) - MUL_F(x1, c2); - } - - m7.0 = RE(c2)*RE(wa1[i]) - m7.1 = IM(c2)*IM(wa1[i]) - m7.2 = RE(c6)*RE(wa1[i+1]) - m7.3 = IM(c6)*IM(wa1[i+1]) - - m8.0 = RE(c2)*IM(wa1[i]) - m8.1 = IM(c2)*RE(wa1[i]) - m8.2 = RE(c6)*IM(wa1[i+1]) - m8.3 = IM(c6)*RE(wa1[i+1]) - - RE(0) = m7.0 - m7.1 - IM(0) = m8.0 + m8.1 - RE(1) = m7.2 - m7.3 - IM(1) = m8.2 + m8.3 - - //// - RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i]) - IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i]) - RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1]) - IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1]) -#endif - - w1 = _mm_load_ps(&RE(wa1[i])); - w3 = _mm_load_ps(&RE(wa2[i])); - w5 = _mm_load_ps(&RE(wa3[i])); - - w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1)); - w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1)); - w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1)); - - m7 = _mm_mul_ps(n8, w1); - m15 = _mm_mul_ps(m6, w3); - m23 = _mm_mul_ps(n9, w5); - m8 = _mm_mul_ps(n8, w2); - m16 = _mm_mul_ps(m6, w4); - m24 = _mm_mul_ps(n9, w6); - - m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0)); - m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0)); - m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0)); - m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1)); - m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1)); - m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1)); - - m11 = _mm_add_ps(m9, m10); - m19 = _mm_add_ps(m17, m18); - m27 = _mm_add_ps(m25, m26); - m12 = _mm_sub_ps(m9, m10); - m20 = _mm_sub_ps(m17, m18); - m28 = _mm_sub_ps(m25, m26); - - m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2)); - m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2)); - m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2)); - m14 = _mm_unpacklo_ps(m12, m13); - m22 = _mm_unpacklo_ps(m20, m21); - m30 = _mm_unpacklo_ps(m28, m29); - - _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14); - _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22); - _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30); - } - } -} -#endif static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa1, const complex_t *wa2, @@ -992,101 +689,6 @@ static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs. ----------------------------------------------------------------------*/ -#ifdef USE_SSE - -#define CONV(A,B,C) ( (A<<2) | ((B & 0x1)<<1) | ((C==1)&0x1) ) - -static INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch, - const uint16_t *ifac, const complex_t *wa, - const int8_t isign) -{ - uint16_t i; - uint16_t k1, l1, l2; - uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1; - - nf = ifac[1]; - na = 0; - l1 = 1; - iw = 0; - - for (k1 = 2; k1 <= nf+1; k1++) - { - ip = ifac[k1]; - l2 = ip*l1; - ido = n / l2; - idl1 = ido*l1; - - ix2 = iw + ido; - ix3 = ix2 + ido; - ix4 = ix3 + ido; - - switch (CONV(ip,na,ido)) - { - case CONV(4,0,0): - //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); - passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); - break; - case CONV(4,0,1): - passf4pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]); - break; - case CONV(4,1,0): - passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); - //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); - break; - case CONV(4,1,1): - passf4pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]); - break; - case CONV(2,0,0): - passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); - //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); - break; - case CONV(2,0,1): - passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); - //passf2pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]); - break; - case CONV(2,1,0): - passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); - //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); - break; - case CONV(2,1,1): - passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); - //passf2pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]); - break; - case CONV(3,0,0): - case CONV(3,0,1): - passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign); - break; - case CONV(3,1,0): - case CONV(3,1,1): - passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign); - break; - case CONV(5,0,0): - case CONV(5,0,1): - passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); - break; - case CONV(5,1,0): - case CONV(5,1,1): - passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); - break; - } - - na = 1 - na; - - l1 = l2; - iw += (ip-1) * ido; - } - - if (na == 0) - return; - - for (i = 0; i < n; i++) - { - RE(c[i]) = RE(ch[i]); - IM(c[i]) = IM(ch[i]); - } -} -#endif - static INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch, const uint16_t *ifac, const complex_t *wa, const int8_t isign) @@ -1255,13 +857,6 @@ void cfftb(cfft_info *cfft, complex_t *c) cfftf1pos(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1); } -#ifdef USE_SSE -void cfftb_sse(cfft_info *cfft, complex_t *c) -{ - cfftf1pos_sse(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1); -} -#endif - static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac) { static uint16_t ntryh[4] = {3, 4, 2, 5}; @@ -1375,19 +970,20 @@ cfft_info *cffti(uint16_t n) switch (n) { - case 64: cfft->tab = cfft_tab_64; break; - case 512: cfft->tab = cfft_tab_512; break; + case 64: cfft->tab = (complex_t*)cfft_tab_64; break; + case 512: cfft->tab = (complex_t*)cfft_tab_512; break; #ifdef LD_DEC - case 256: cfft->tab = cfft_tab_256; break; + case 256: cfft->tab = (complex_t*)cfft_tab_256; break; #endif #ifdef ALLOW_SMALL_FRAMELENGTH - case 60: cfft->tab = cfft_tab_60; break; - case 480: cfft->tab = cfft_tab_480; break; + case 60: cfft->tab = (complex_t*)cfft_tab_60; break; + case 480: cfft->tab = (complex_t*)cfft_tab_480; break; #ifdef LD_DEC - case 240: cfft->tab = cfft_tab_240; break; + case 240: cfft->tab = (complex_t*)cfft_tab_240; break; #endif #endif + case 128: cfft->tab = (complex_t*)cfft_tab_128; break; } #endif |