summaryrefslogtreecommitdiff
path: root/ac3dec/srfft_kni.S
diff options
context:
space:
mode:
Diffstat (limited to 'ac3dec/srfft_kni.S')
-rw-r--r--ac3dec/srfft_kni.S289
1 files changed, 289 insertions, 0 deletions
diff --git a/ac3dec/srfft_kni.S b/ac3dec/srfft_kni.S
new file mode 100644
index 0000000..a42a41b
--- /dev/null
+++ b/ac3dec/srfft_kni.S
@@ -0,0 +1,289 @@
+/*
+ * srfft_kni.S
+ *
+ * Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - October 2000
+ *
+ *
+ * srfft_kni.S is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * srfft_kni.S is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Make; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifdef __i386__
+
+.section .rodata
+ .align 16
+hsqrt2: .float 0f0.707106781188
+ .float 0f0.707106781188
+ .float 0f-0.707106781188
+ .float 0f-0.707106781188
+C_1: .float 0f-1.0
+ .float 0f1.0
+ .float 0f-1.0
+ .float 0f1.0
+
+.text
+ .align 4
+.global fft_4_kni
+ .type fft_4_kni, @function
+fft_4_kni:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 8(%ebp), %eax /* complex_t * */
+
+ movaps (%eax), %xmm0 /* x[1] | x[0] */
+ movaps 16(%eax), %xmm2 /* x[3] | x[2] */
+ movaps %xmm0, %xmm1 /* x[1] | x[0] */
+ addps %xmm2, %xmm0 /* x[1] + x[3] | x[0] + x[2] */
+ subps %xmm2, %xmm1 /* x[1] - x[3] | x[0] - x[2] */
+ xorps %xmm6, %xmm6
+ movhlps %xmm1, %xmm4 /* x[1] - x[3] */
+ movhlps %xmm0, %xmm3 /* x[1] + x[3] */
+ subss %xmm4, %xmm6 /* -(x[1] - x[3]).re */
+ movlhps %xmm1, %xmm0 /* x[0] - x[2] | x[0] + x[2] */
+ movss %xmm6, %xmm4 /* (x[1] - x[3]).im | (x[3]-x[1]).re */
+ movaps %xmm0, %xmm2 /* x[0] - x[2] | x[0] + x[2] */
+ shufps $0x14, %xmm4, %xmm3 /* -i*(x[2] - x[3] | x[2] + x[3] */
+ addps %xmm3, %xmm0
+ subps %xmm3, %xmm2
+ movaps %xmm0, (%eax)
+ movaps %xmm2, 16(%eax)
+
+ leave
+ ret
+
+
+ .align 4
+.global fft_8_kni
+ .type fft_8_kni, @function
+fft_8_kni:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 8(%ebp), %eax /* complext_t */
+
+ pushl %ebx
+ movlps (%eax), %xmm0 /* x[0] */
+ movlps 32(%eax), %xmm1 /* x[4] */
+ movhps 16(%eax), %xmm0 /* x[2] | x[0] */
+ movhps 48(%eax), %xmm1 /* x[6] | x[4] */
+ movaps %xmm0, %xmm2 /* x[2] | x[0] */
+ xorps %xmm3, %xmm3
+ addps %xmm1, %xmm0 /* x[2] + x[6] | x[0] + x[4] */
+ subps %xmm1, %xmm2 /* x[2] - x[6] | x[0] - x[4] */
+ movhlps %xmm0, %xmm5 /* x[2] + x[6] */
+ movhlps %xmm2, %xmm4
+ movlhps %xmm2, %xmm0 /* x[0] - x[4] | x[0] + x[4] */
+ subss %xmm4, %xmm3 /* -(x[2]-x[6]).re */
+ movaps %xmm0, %xmm7 /* x[0] - x[4] | x[0] + x[4] */
+ movss %xmm3, %xmm4 /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
+ movlps 8(%eax), %xmm1 /* x[1] */
+ shufps $0x14, %xmm4, %xmm5 /* -i*(x[2] - x[6]) | x[2] + x[6] */
+
+ addps %xmm5, %xmm0 /* yt */
+ subps %xmm5, %xmm7 /* yb */
+
+ movhps 24(%eax), %xmm1 /* x[3] | x[1] */
+ movl $hsqrt2, %ebx
+ movlps 40(%eax), %xmm2 /* x[5] */
+ movhps 56(%eax), %xmm2 /* /x[7] | x[5] */
+ movaps %xmm1, %xmm3 /* x[3] | x[1] */
+ addps %xmm2, %xmm1 /* x[3] + x[7] | x[1] + x[5] */
+ subps %xmm2, %xmm3 /* x[3] - x[7] | x[1] - x[5] */
+ movaps (%ebx), %xmm4 /* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
+ movaps %xmm3, %xmm6 /* x[3] - x[7] | x[1] - x[5] */
+ mulps %xmm4, %xmm3
+ shufps $0xc8, %xmm4, %xmm4 /* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
+ shufps $0xb1, %xmm6, %xmm6
+ mulps %xmm4, %xmm6
+ addps %xmm3, %xmm6 /* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5] */
+ movhlps %xmm1, %xmm5 /* x[3] + x[7] */
+ movlhps %xmm6, %xmm1 /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+ shufps $0xe4, %xmm6, %xmm5 /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
+ movaps %xmm1, %xmm3 /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+ movl $C_1, %ebx
+ addps %xmm5, %xmm1 /* u */
+ subps %xmm5, %xmm3 /* v */
+ movaps %xmm0, %xmm2 /* yb */
+ movaps %xmm7, %xmm4 /* yt */
+ movaps (%ebx), %xmm5
+ mulps %xmm5, %xmm3
+ addps %xmm1, %xmm0 /* yt + u */
+ subps %xmm1, %xmm2 /* yt - u */
+ shufps $0xb1, %xmm3, %xmm3 /* -i * v */
+ movaps %xmm0, (%eax)
+ movaps %xmm2, 32(%eax)
+ addps %xmm3, %xmm4 /* yb - i*v */
+ subps %xmm3, %xmm7 /* yb + i*v */
+ movaps %xmm4, 16(%eax)
+ movaps %xmm7, 48(%eax)
+
+ popl %ebx
+ leave
+ ret
+
+ .align 4
+.global fft_asmb_kni
+ .type fft_asmb, @function
+fft_asmb_kni:
+ pushl %ebp
+ movl %esp, %ebp
+
+ subl $4, %esp
+
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ pushl %esi
+ pushl %edi
+
+ movl 8(%ebp), %ecx /* k */
+ movl 12(%ebp), %eax /* x */
+ movl %ecx, -4(%ebp) /* k */
+ movl 16(%ebp), %ebx /* wT */
+ movl 20(%ebp), %edx /* d */
+ movl 24(%ebp), %esi /* d3 */
+ shll $4, %ecx /* 16k */
+ addl $8, %edx
+ leal (%eax, %ecx, 2), %edi
+ addl $8, %esi
+
+ /* TRANSZERO and TRANS */
+ movaps (%eax), %xmm0 /* x[1] | x[0] */
+ movaps (%ebx), %xmm1 /* wT[1] | wT[0] */
+ movaps (%ebx, %ecx), %xmm2 /* wB[1] | wB[0] */
+ movlps (%edx), %xmm3 /* d */
+ movlps (%esi), %xmm4 /* d3 */
+ movhlps %xmm1, %xmm5 /* wT[1] */
+ movhlps %xmm2, %xmm6 /* wB[1] */
+ shufps $0x50, %xmm3, %xmm3 /* d[1].im | d[1].im | d[1].re | d[1].re */
+ shufps $0x50, %xmm4, %xmm4 /* d3[1].im | d3[1].im | d3[i].re | d3[i].re */
+ movlhps %xmm5, %xmm5 /* wT[1] | wT[1] */
+ movlhps %xmm6, %xmm6 /* wB[1] | wB[1] */
+ mulps %xmm3, %xmm5
+ mulps %xmm4, %xmm6
+ movhlps %xmm5, %xmm7 /* wT[1].im * d[1].im | wT[1].re * d[1].im */
+ movlhps %xmm6, %xmm5 /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
+ shufps $0xb1, %xmm6, %xmm7 /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
+ movl $C_1, %edi
+ movaps (%edi), %xmm4
+ mulps %xmm4, %xmm7
+ addps %xmm7, %xmm5 /* wB[1] * d3[1] | wT[1] * d[1] */
+ movlhps %xmm5, %xmm1 /* d[1] * wT[1] | wT[0] */
+ shufps $0xe4, %xmm5, %xmm2 /* d3[1] * wB[1] | wB[0] */
+ movaps %xmm1, %xmm3 /* d[1] * wT[1] | wT[0] */
+ leal (%eax, %ecx, 2), %edi
+ addps %xmm2, %xmm1 /* u */
+ subps %xmm2, %xmm3 /* v */
+ mulps %xmm4, %xmm3
+ movaps (%eax, %ecx), %xmm5 /* xk[1] | xk[0] */
+ shufps $0xb1, %xmm3, %xmm3 /* -i * v */
+ movaps %xmm0, %xmm2 /* x[1] | x[0] */
+ movaps %xmm5, %xmm6 /* xk[1] | xk[0] */
+ addps %xmm1, %xmm0
+ subps %xmm1, %xmm2
+ addps %xmm3, %xmm5
+ subps %xmm3, %xmm6
+ movaps %xmm0, (%eax)
+ movaps %xmm2, (%edi)
+ movaps %xmm5, (%eax, %ecx)
+ movaps %xmm6, (%edi, %ecx)
+ addl $16, %eax
+ addl $16, %ebx
+ addl $8, %edx
+ addl $8, %esi
+ decl -4(%ebp)
+
+.loop:
+ movaps (%ebx), %xmm0 /* wT[1] | wT[0] */
+ movaps (%edx), %xmm1 /* d[1] | d[0] */
+
+ movaps (%ebx, %ecx), %xmm4 /* wB[1] | wB[0] */
+ movaps (%esi), %xmm5 /* d3[1] | d3[0] */
+
+ movhlps %xmm0, %xmm2 /* wT[1] */
+ movhlps %xmm1, %xmm3 /* d[1] */
+
+ movhlps %xmm4, %xmm6 /* wB[1] */
+ movhlps %xmm5, %xmm7 /* d3[1] */
+
+ shufps $0x50, %xmm1, %xmm1 /* d[0].im | d[0].im | d[0].re | d[0].re */
+ shufps $0x50, %xmm3, %xmm3 /* d[1].im | d[1].im | d[1].re | d[1].re */
+
+ movlhps %xmm0, %xmm0 /* wT[0] | wT[0] */
+ shufps $0x50, %xmm5, %xmm5 /* d3[0].im | d3[0].im | d3[0].re | d3[0].re */
+ movlhps %xmm2, %xmm2 /* wT[1] | wT[1] */
+ shufps $0x50, %xmm7, %xmm7 /* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
+
+ mulps %xmm1, %xmm0 /* d[0].im * wT[0].im | d[0].im * wT[0].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
+ mulps %xmm3, %xmm2 /* d[1].im * wT[1].im | d[1].im * wT[1].re | d[1].re * wT[1].im | d[1].re * wT[1].re */
+ movlhps %xmm4, %xmm4 /* wB[0] | wB[0] */
+ movlhps %xmm6, %xmm6 /* wB[1] | wB[1] */
+
+ movhlps %xmm0, %xmm1 /* d[0].im * wT[0].im | d[0].im * wT[0].re */
+ movlhps %xmm2, %xmm0 /* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
+ mulps %xmm5, %xmm4 /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
+ mulps %xmm7, %xmm6 /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
+ shufps $0xb1, %xmm2, %xmm1 /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
+ movl $C_1, %edi
+ movaps (%edi), %xmm3 /* 1.0 | -1.0 | 1.0 | -1.0 */
+
+ movhlps %xmm4, %xmm5 /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
+ mulps %xmm3, %xmm1 /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
+ movlhps %xmm6, %xmm4 /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].im * d3[0].re */
+ addps %xmm1, %xmm0 /* wT[1] * d[1] | wT[0] * d[0] */
+
+ shufps $0xb1, %xmm6, %xmm5 /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
+ mulps %xmm3, %xmm5 /* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
+ addps %xmm5, %xmm4 /* wB[1] * d3[1] | wB[0] * d3[0] */
+
+ movaps %xmm0, %xmm1 /* wT[1] * d[1] | wT[0] * d[0] */
+ addps %xmm4, %xmm0 /* u */
+ subps %xmm4, %xmm1 /* v */
+ movaps (%eax), %xmm6 /* x[1] | x[0] */
+ leal (%eax, %ecx, 2), %edi
+ mulps %xmm3, %xmm1
+ addl $16, %ebx
+ addl $16, %esi
+ shufps $0xb1, %xmm1, %xmm1 /* -i * v */
+ movaps (%eax, %ecx), %xmm7 /* xk[1] | xk[0] */
+ movaps %xmm6, %xmm2
+ movaps %xmm7, %xmm4
+ addps %xmm0, %xmm6
+ subps %xmm0, %xmm2
+ movaps %xmm6, (%eax)
+ movaps %xmm2, (%edi)
+ addps %xmm1, %xmm7
+ subps %xmm1, %xmm4
+ addl $16, %edx
+ movaps %xmm7, (%eax, %ecx)
+ movaps %xmm4, (%edi, %ecx)
+
+ addl $16, %eax
+ decl -4(%ebp)
+ jnz .loop
+
+.end:
+ popl %edi
+ popl %esi
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+
+ addl $4, %esp
+
+ leave
+ ret
+#endif