summary refs log tree commit diff
path: root/ac3dec/downmix_kni.S
diff options
context:
space:
mode:
Diffstat (limited to 'ac3dec/downmix_kni.S')
-rw-r--r--  ac3dec/downmix_kni.S  396
1 files changed, 396 insertions, 0 deletions
diff --git a/ac3dec/downmix_kni.S b/ac3dec/downmix_kni.S
new file mode 100644
index 00000000..7df8c060
--- /dev/null
+++ b/ac3dec/downmix_kni.S
@@ -0,0 +1,396 @@
+/*
+ * downmix_kni.S
+ *
+ * Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - October 2000
+ *
+ *
+ * downmix_kni.S is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * downmix_kni.S is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Make; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifdef __i386__
+
+/* 1/sqrt(2): gain used when folding one (mono) channel equally into
+   both stereo outputs (see stream_sample_1ch_to_s16_kni). */
+.section .rodata
+	.align 4
+sqrt2:	.float 0f0.7071068
+	.p2align 5,0		/* pad .rodata to 32 bytes, zero fill (was a stray trailing comma) */
+
+ .section .text
+
+	.align 4
+	.global downmix_3f_2r_to_2ch_kni
+	.type downmix_3f_2r_to_2ch_kni, @function
+
+/*
+ * downmix_3f_2r_to_2ch_kni (samples, dm_par) -- cdecl, i386 SSE (KNI).
+ *
+ * In-place downmix of 3 front + 2 rear channels to stereo.  Channels
+ * sit 1024 bytes (256 floats) apart in samples[]: L at 0, C at 1024,
+ * R at 2048, Lsur at 3072, Rsur at 4096.  dm_par holds three float
+ * coefficients: unit at 0, clev at 4, slev at 8.
+ *   out_left  (-> offset 0)    = unit*L + clev*C + slev*Lsur
+ *   out_right (-> offset 1024) = unit*R + clev*C + slev*Rsur
+ * samples[] must be 16-byte aligned (movaps).  Clobbers xmm0-xmm7 and
+ * eflags; eax/ebx/ecx are saved and restored.
+ */
+downmix_3f_2r_to_2ch_kni:
+	pushl %ebp
+	movl %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl 8(%ebp), %eax	/* samples[] */
+	movl 12(%ebp), %ebx	/* &dm_par */
+	movl $64, %ecx		/* loop counter: 64 x 4 floats = 256 samples */
+
+	movss (%ebx), %xmm5	/* unit */
+	shufps $0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss 4(%ebx), %xmm6	/* clev */
+	shufps $0, %xmm6, %xmm6	/* clev | clev | clev | clev */
+
+	movss 8(%ebx), %xmm7	/* slev */
+	shufps $0, %xmm7, %xmm7	/* slev | slev | slev | slev */
+
+.loop:				/* 4 samples of every channel per iteration */
+	movaps (%eax), %xmm0	/* left */
+	movaps 2048(%eax), %xmm1	/* right */
+	movaps 1024(%eax), %xmm2	/* center */
+	mulps %xmm5, %xmm0	/* unit*L */
+	mulps %xmm5, %xmm1	/* unit*R */
+
+	mulps %xmm6, %xmm2	/* clev*C */
+	movaps 3072(%eax), %xmm3	/* leftsur */
+	movaps 4096(%eax), %xmm4	/* rightsur (was misspelled "rithgsur") */
+	addps %xmm2, %xmm0	/* + clev*C into left */
+	addps %xmm2, %xmm1	/* + clev*C into right */
+
+	mulps %xmm7, %xmm3	/* slev*Lsur */
+	mulps %xmm7, %xmm4	/* slev*Rsur */
+	addps %xmm3, %xmm0
+	addps %xmm4, %xmm1
+
+	movaps %xmm0, (%eax)	/* stereo left -> channel slot 0 */
+	movaps %xmm1, 1024(%eax)	/* stereo right -> channel slot 1 */
+
+	addl $16, %eax		/* advance 4 floats */
+	decl %ecx
+	jnz .loop
+
+	popl %ecx
+	popl %ebx
+	popl %eax
+
+	leave
+	ret
+	.p2align 4,,7
+
+	.global downmix_2f_2r_to_2ch_kni
+	.type downmix_2f_2r_to_2ch_kni, @function
+
+/*
+ * downmix_2f_2r_to_2ch_kni (samples, dm_par) -- cdecl, i386 SSE (KNI).
+ *
+ * In-place downmix of 2 front + 2 rear channels to stereo.  Channel
+ * layout (1024 bytes apart): L at 0, R at 1024, Lsur at 2048,
+ * Rsur at 3072.  Uses dm_par coefficients unit (offset 0) and
+ * slev (offset 8); clev is not needed (no center channel).
+ *   out_left  (-> 0)    = unit*L + slev*Lsur
+ *   out_right (-> 1024) = unit*R + slev*Rsur
+ * samples[] must be 16-byte aligned (movaps).  Clobbers xmm0/1/3/4/5/7
+ * and eflags; eax/ebx/ecx are saved and restored.
+ */
+downmix_2f_2r_to_2ch_kni:
+	pushl %ebp
+	movl %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl 8(%ebp), %eax	/* samples[] */
+	movl 12(%ebp), %ebx	/* &dm_par */
+	movl $64, %ecx		/* loop counter: 64 x 4 floats = 256 samples */
+
+	movss (%ebx), %xmm5	/* unit */
+	shufps $0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss 8(%ebx), %xmm7	/* slev */
+	shufps $0, %xmm7, %xmm7	/* slev | slev | slev | slev */
+
+.loop3:				/* 4 samples of every channel per iteration */
+	movaps (%eax), %xmm0	/* left */
+	movaps 1024(%eax), %xmm1	/* right */
+	movaps 2048(%eax), %xmm3	/* leftsur */
+	mulps %xmm5, %xmm0	/* unit*L */
+	mulps %xmm5, %xmm1	/* unit*R */
+
+	movaps 3072(%eax), %xmm4	/* rightsur */
+
+	mulps %xmm7, %xmm3	/* slev*Lsur */
+	mulps %xmm7, %xmm4	/* slev*Rsur */
+	addps %xmm3, %xmm0
+	addps %xmm4, %xmm1
+
+	movaps %xmm0, (%eax)	/* stereo left -> channel slot 0 */
+	movaps %xmm1, 1024(%eax)	/* stereo right -> channel slot 1 */
+
+	addl $16, %eax		/* advance 4 floats */
+	decl %ecx
+	jnz .loop3
+
+	popl %ecx
+	popl %ebx
+	popl %eax
+
+	leave
+	ret
+	.p2align 4,,7
+
+	.global downmix_3f_1r_to_2ch_kni
+	.type downmix_3f_1r_to_2ch_kni, @function
+
+/*
+ * downmix_3f_1r_to_2ch_kni (samples, dm_par) -- cdecl, i386 SSE (KNI).
+ *
+ * In-place downmix of 3 front + 1 rear channel to stereo.  Channel
+ * layout (1024 bytes apart): L at 0, C at 1024, R at 2048, Sur at 3072.
+ * dm_par coefficients: unit (0), clev (4), slev (8).
+ *   out_left  (-> 0)    = unit*L + clev*C - slev*Sur
+ *   out_right (-> 1024) = unit*R + clev*C + slev*Sur
+ * NOTE(review): the single surround is subtracted from left and added
+ * to right -- presumably out-of-phase (Dolby-Surround-style) encoding
+ * of the rear channel; confirm against the C reference downmix.
+ * samples[] must be 16-byte aligned (movaps).  Clobbers xmm0-xmm3,
+ * xmm5-xmm7 and eflags; eax/ebx/ecx are saved and restored.
+ */
+downmix_3f_1r_to_2ch_kni:
+	pushl %ebp
+	movl %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl 8(%ebp), %eax	/* samples[] */
+	movl 12(%ebp), %ebx	/* &dm_par */
+	movl $64, %ecx		/* loop counter: 64 x 4 floats = 256 samples */
+
+	movss (%ebx), %xmm5	/* unit */
+	shufps $0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss 4(%ebx), %xmm6	/* clev */
+	shufps $0, %xmm6, %xmm6	/* clev | clev | clev | clev */
+
+	movss 8(%ebx), %xmm7	/* slev */
+	shufps $0, %xmm7, %xmm7	/* slev | slev | slev | slev */
+
+.loop4:				/* 4 samples of every channel per iteration */
+	movaps (%eax), %xmm0	/* left */
+	movaps 2048(%eax), %xmm1	/* right */
+	movaps 1024(%eax), %xmm2	/* center */
+	mulps %xmm5, %xmm0	/* unit*L */
+	mulps %xmm5, %xmm1	/* unit*R */
+
+	mulps %xmm6, %xmm2	/* clev*C */
+	movaps 3072(%eax), %xmm3	/* sur */
+
+	addps %xmm2, %xmm0	/* + clev*C into left */
+	mulps %xmm7, %xmm3	/* slev*Sur */
+
+	addps %xmm2, %xmm1	/* + clev*C into right */
+
+	subps %xmm3, %xmm0	/* left gets -slev*Sur */
+	addps %xmm3, %xmm1	/* right gets +slev*Sur */
+
+	movaps %xmm0, (%eax)	/* stereo left -> channel slot 0 */
+	movaps %xmm1, 1024(%eax)	/* stereo right -> channel slot 1 */
+
+	addl $16, %eax		/* advance 4 floats */
+	decl %ecx
+	jnz .loop4
+
+	popl %ecx
+	popl %ebx
+	popl %eax
+
+	leave
+	ret
+	.p2align 4,,7
+
+	.global downmix_2f_1r_to_2ch_kni
+	.type downmix_2f_1r_to_2ch_kni, @function
+
+/*
+ * downmix_2f_1r_to_2ch_kni (samples, dm_par) -- cdecl, i386 SSE (KNI).
+ *
+ * In-place downmix of 2 front + 1 rear channel to stereo.  Channel
+ * layout (1024 bytes apart): L at 0, R at 1024, Sur at 2048.
+ * dm_par coefficients used: unit (offset 0) and slev (offset 8).
+ *   out_left  (-> 0)    = unit*L - slev*Sur
+ *   out_right (-> 1024) = unit*R + slev*Sur
+ * NOTE(review): surround is phase-split across the pair (sub/add),
+ * matching downmix_3f_1r_to_2ch_kni -- confirm against C reference.
+ * samples[] must be 16-byte aligned (movaps).  Clobbers xmm0/1/3/5/7
+ * and eflags; eax/ebx/ecx are saved and restored.
+ */
+downmix_2f_1r_to_2ch_kni:
+	pushl %ebp
+	movl %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl 8(%ebp), %eax	/* samples[] */
+	movl 12(%ebp), %ebx	/* &dm_par */
+	movl $64, %ecx		/* loop counter: 64 x 4 floats = 256 samples */
+
+	movss (%ebx), %xmm5	/* unit */
+	shufps $0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss 8(%ebx), %xmm7	/* slev */
+	shufps $0, %xmm7, %xmm7	/* slev | slev | slev | slev */
+
+.loop5:				/* 4 samples of every channel per iteration */
+	movaps (%eax), %xmm0	/* left */
+	movaps 1024(%eax), %xmm1	/* right */
+
+	mulps %xmm5, %xmm0	/* unit*L */
+	mulps %xmm5, %xmm1	/* unit*R */
+
+	movaps 2048(%eax), %xmm3	/* sur */
+
+	mulps %xmm7, %xmm3	/* slev*Sur */
+
+	subps %xmm3, %xmm0	/* left gets -slev*Sur */
+	addps %xmm3, %xmm1	/* right gets +slev*Sur */
+
+	movaps %xmm0, (%eax)	/* stereo left -> channel slot 0 */
+	movaps %xmm1, 1024(%eax)	/* stereo right -> channel slot 1 */
+
+	addl $16, %eax		/* advance 4 floats */
+	decl %ecx
+	jnz .loop5
+
+	popl %ecx
+	popl %ebx
+	popl %eax
+
+	leave
+	ret
+	.p2align 4,,7
+
+	.global downmix_3f_0r_to_2ch_kni
+	.type downmix_3f_0r_to_2ch_kni, @function
+
+/*
+ * downmix_3f_0r_to_2ch_kni (samples, dm_par) -- cdecl, i386 SSE (KNI).
+ *
+ * In-place downmix of 3 front channels (no rear) to stereo.  Channel
+ * layout (1024 bytes apart): L at 0, C at 1024, R at 2048.
+ * dm_par coefficients used: unit (offset 0) and clev (offset 4);
+ * slev is not loaded (no surround).
+ *   out_left  (-> 0)    = unit*L + clev*C
+ *   out_right (-> 1024) = unit*R + clev*C
+ * samples[] must be 16-byte aligned (movaps).  Clobbers xmm0/1/2/5/6
+ * and eflags; eax/ebx/ecx are saved and restored.
+ */
+downmix_3f_0r_to_2ch_kni:
+	pushl %ebp
+	movl %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl 8(%ebp), %eax	/* samples[] */
+	movl 12(%ebp), %ebx	/* &dm_par */
+	movl $64, %ecx		/* loop counter: 64 x 4 floats = 256 samples */
+
+	movss (%ebx), %xmm5	/* unit */
+	shufps $0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss 4(%ebx), %xmm6	/* clev */
+	shufps $0, %xmm6, %xmm6	/* clev | clev | clev | clev */
+
+
+.loop6:				/* 4 samples of every channel per iteration */
+	movaps (%eax), %xmm0	/* left */
+	movaps 2048(%eax), %xmm1	/* right */
+	movaps 1024(%eax), %xmm2	/* center */
+	mulps %xmm5, %xmm0	/* unit*L */
+	mulps %xmm5, %xmm1	/* unit*R */
+
+	mulps %xmm6, %xmm2	/* clev*C */
+
+	addps %xmm2, %xmm0	/* left  = unit*L + clev*C */
+
+	addps %xmm2, %xmm1	/* right = unit*R + clev*C */
+
+	movaps %xmm0, (%eax)	/* stereo left -> channel slot 0 */
+	movaps %xmm1, 1024(%eax)	/* stereo right -> channel slot 1 */
+
+	addl $16, %eax		/* advance 4 floats */
+	decl %ecx
+	jnz .loop6
+
+	popl %ecx
+	popl %ebx
+	popl %eax
+
+	leave
+	ret
+	.p2align 4,,7
+
+	.global stream_sample_2ch_to_s16_kni
+	.type stream_sample_2ch_to_s16_kni, @function
+
+/*
+ * stream_sample_2ch_to_s16_kni (s16_samples, left, right) -- cdecl.
+ *
+ * Converts 256 float samples of each stereo channel to interleaved
+ * signed 16-bit PCM: output word order is l0 r0 l1 r1 ...
+ * cvtps2pi rounds according to MXCSR (round-to-nearest by default);
+ * packssdw saturates each int32 to [-32768, 32767].
+ * left/right must be 16-byte aligned (movaps); uses MMX regs mm0-mm3
+ * and executes emms before returning so the caller can use x87.
+ * eax/ebx/ecx/edx are saved and restored; xmm0-xmm3 clobbered.
+ */
+stream_sample_2ch_to_s16_kni:
+	pushl %ebp
+	movl %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %edx
+	pushl %ecx
+
+	movl 8(%ebp), %eax	/* s16_samples */
+	movl 12(%ebp), %ebx	/* left */
+	movl 16(%ebp), %edx	/* right */
+	movl $64, %ecx		/* 64 x 4 samples per channel = 256 */
+
+.loop1:
+	movaps (%ebx), %xmm0	/* l3 | l2 | l1 | l0 */
+	movaps (%edx), %xmm1	/* r3 | r2 | r1 | r0 */
+	movhlps %xmm0, %xmm2	/* l3 | l2 */
+	movhlps %xmm1, %xmm3	/* r3 | r2 */
+	unpcklps %xmm1, %xmm0	/* r1 | l1 | r0 | l0 */
+	unpcklps %xmm3, %xmm2	/* r3 | l3 | r2 | l2 */
+
+	cvtps2pi %xmm0, %mm0	/* r0 l0 --> mm0, int_32 */
+	movhlps %xmm0, %xmm0	/* bring r1 | l1 into the low half */
+	cvtps2pi %xmm0, %mm1	/* r1 l1 --> mm1, int_32 */
+
+	cvtps2pi %xmm2, %mm2	/* r2 l2 --> mm2, int_32 */
+	movhlps %xmm2, %xmm2	/* bring r3 | l3 into the low half */
+	cvtps2pi %xmm2, %mm3	/* r3 l3 --> mm3, int_32 */
+	packssdw %mm1, %mm0	/* r1 l1 r0 l0 --> mm0, int_16 (saturating) */
+	packssdw %mm3, %mm2	/* r3 l3 r2 l2 --> mm2, int_16 (saturating) */
+
+	movq %mm0, (%eax)	/* store l0 r0 l1 r1 */
+	movq %mm2, 8(%eax)	/* store l2 r2 l3 r3 */
+	addl $16, %eax		/* 8 output s16 = 16 bytes */
+	addl $16, %ebx		/* 4 input floats per channel */
+	addl $16, %edx
+
+	decl %ecx
+	jnz .loop1
+
+	popl %ecx
+	popl %edx
+	popl %ebx
+	popl %eax
+
+	emms			/* leave MMX state; re-enable x87 */
+
+	leave
+	ret
+	.p2align 4,,7
+
+	.global stream_sample_1ch_to_s16_kni
+	.type stream_sample_1ch_to_s16_kni, @function
+
+/*
+ * stream_sample_1ch_to_s16_kni (s16_samples, left) -- cdecl.
+ *
+ * Converts 256 mono float samples, scaled by 1/sqrt(2), to stereo-
+ * interleaved signed 16-bit PCM with each sample duplicated into both
+ * channels: output word order c0 c0 c1 c1 c2 c2 c3 c3 ...
+ * BUGFIX: the original did "packssdw %mm0, %mm0", which produces the
+ * word order c0 c1 c0 c1 (pairs (c0,c1),(c0,c1)) instead of the
+ * duplication its own comments describe.  Pack all four samples first,
+ * then punpcklwd/punpckhwd each word with itself to duplicate.
+ * cvtps2pi rounds per MXCSR; packssdw saturates to [-32768, 32767].
+ * left must be 16-byte aligned (movaps); emms is executed before
+ * returning.  eax/ebx/ecx saved and restored; xmm0/xmm2/xmm7, mm0/mm1
+ * clobbered.
+ */
+stream_sample_1ch_to_s16_kni:
+	pushl %ebp
+	movl %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl $sqrt2, %eax
+	movss (%eax), %xmm7	/* 1/sqrt(2) */
+	movl 8(%ebp), %eax	/* s16_samples */
+	movl 12(%ebp), %ebx	/* left */
+	shufps $0, %xmm7, %xmm7	/* broadcast 1/sqrt(2) to all 4 lanes */
+	movl $64, %ecx		/* 64 x 4 mono samples = 256 */
+
+.loop2:
+	movaps (%ebx), %xmm0	/* c3 | c2 | c1 | c0 */
+	mulps %xmm7, %xmm0	/* scale mono by 1/sqrt(2) */
+	movhlps %xmm0, %xmm2	/* c3 | c2 */
+
+	cvtps2pi %xmm0, %mm0	/* c1 c0 --> mm0, int_32 */
+	cvtps2pi %xmm2, %mm1	/* c3 c2 --> mm1, int_32 */
+
+	packssdw %mm1, %mm0	/* c3 c2 c1 c0 --> mm0, int_16 (saturating) */
+	movq %mm0, %mm1		/* keep a copy for the high pair */
+	punpcklwd %mm0, %mm0	/* c1 c1 c0 c0 --> mm0, int_16 */
+	punpckhwd %mm1, %mm1	/* c3 c3 c2 c2 --> mm1, int_16 */
+
+	movq %mm0, (%eax)	/* store c0 c0 c1 c1 */
+	movq %mm1, 8(%eax)	/* store c2 c2 c3 c3 */
+	addl $16, %eax		/* 8 output s16 = 16 bytes */
+	addl $16, %ebx		/* 4 input floats */
+
+	decl %ecx
+	jnz .loop2
+
+	popl %ecx
+	popl %ebx
+	popl %eax
+
+	emms			/* leave MMX state; re-enable x87 */
+	leave
+	ret
+#endif