diff options
Diffstat (limited to 'ac3dec/downmix_kni.S')
-rw-r--r-- | ac3dec/downmix_kni.S | 396 |
1 files changed, 396 insertions, 0 deletions
/*
 *  downmix_kni.S
 *
 *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - October 2000
 *
 *
 *  downmix_kni.S is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  downmix_kni.S is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

/*
 * Syntax : GNU as, AT&T operand order (source, destination), run through cpp.
 * Target : 32-bit x86 with SSE and MMX.
 * ABI    : every entry point reads its arguments from the stack at
 *          8(%ebp), 12(%ebp), ... after a standard frame-pointer prologue.
 *
 * Common properties of all routines in this file:
 *   - Every general purpose register that is used is pushed on entry and
 *     popped on exit, so no GPR is modified for the caller.
 *   - %xmm0-%xmm7 are used as scratch and are NOT preserved; the two
 *     stream_sample_* routines also use %mm0-%mm3 and execute emms before
 *     returning.
 *   - All vector loads/stores on sample buffers use movaps, so those
 *     buffers must be 16-byte aligned.
 *   - Each loop runs 64 times and handles 4 floats (16 bytes) per pass,
 *     i.e. 256 floats (1024 bytes) per channel.
 *
 * Downmix routines:
 *   arg0 = samples[]: base of the channel buffers; channel k starts at
 *          byte offset k * 1024.
 *   arg1 = &dm_par  : three consecutive floats: unit at +0, clev at +4,
 *          slev at +8.
 *   The mix is done in place: the left result overwrites the channel at
 *   offset 0 and the right result overwrites the channel at offset 1024.
 */

#ifdef __i386__

	.section .rodata
	.align 4
/* NOTE: despite the label name, the stored value is 1/sqrt(2). */
sqrt2:	.float 0f0.7071068
	.p2align 5

	.section .text

/*
 * downmix_3f_2r_to_2ch_kni(samples, &dm_par)
 *
 * Channel offsets: left 0, center 1024, right 2048, leftsur 3072,
 * rightsur 4096.
 *   left  = unit * left  + clev * center + slev * leftsur
 *   right = unit * right + clev * center + slev * rightsur
 */
	.align 4
	.global downmix_3f_2r_to_2ch_kni
	.type downmix_3f_2r_to_2ch_kni, @function

downmix_3f_2r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	8(%ebp), %eax		/* samples[] */
	movl	12(%ebp), %ebx		/* &dm_par */
	movl	$64, %ecx		/* loop counter */

	movss	(%ebx), %xmm5		/* unit */
	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */

	movss	4(%ebx), %xmm6		/* clev */
	shufps	$0, %xmm6, %xmm6	/* clev | clev | clev | clev */

	movss	8(%ebx), %xmm7		/* slev */
	shufps	$0, %xmm7, %xmm7	/* slev | slev | slev | slev */

.Ldm_3f_2r_loop:
	movaps	(%eax), %xmm0		/* left */
	movaps	2048(%eax), %xmm1	/* right */
	movaps	1024(%eax), %xmm2	/* center */
	mulps	%xmm5, %xmm0
	mulps	%xmm5, %xmm1

	mulps	%xmm6, %xmm2
	movaps	3072(%eax), %xmm3	/* leftsur */
	movaps	4096(%eax), %xmm4	/* rightsur */
	addps	%xmm2, %xmm0
	addps	%xmm2, %xmm1

	mulps	%xmm7, %xmm3
	mulps	%xmm7, %xmm4
	addps	%xmm3, %xmm0
	addps	%xmm4, %xmm1

	movaps	%xmm0, (%eax)		/* left result */
	/* The center slot was already loaded this pass, so it can be reused. */
	movaps	%xmm1, 1024(%eax)	/* right result */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldm_3f_2r_loop

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.size downmix_3f_2r_to_2ch_kni, .-downmix_3f_2r_to_2ch_kni
	.p2align 4,,7

/*
 * downmix_2f_2r_to_2ch_kni(samples, &dm_par)
 *
 * Channel offsets: left 0, right 1024, leftsur 2048, rightsur 3072.
 *   left  = unit * left  + slev * leftsur
 *   right = unit * right + slev * rightsur
 */
	.global downmix_2f_2r_to_2ch_kni
	.type downmix_2f_2r_to_2ch_kni, @function

downmix_2f_2r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	8(%ebp), %eax		/* samples[] */
	movl	12(%ebp), %ebx		/* &dm_par */
	movl	$64, %ecx		/* loop counter */

	movss	(%ebx), %xmm5		/* unit */
	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */

	movss	8(%ebx), %xmm7		/* slev */
	shufps	$0, %xmm7, %xmm7	/* slev | slev | slev | slev */

.Ldm_2f_2r_loop:
	movaps	(%eax), %xmm0		/* left */
	movaps	1024(%eax), %xmm1	/* right */
	movaps	2048(%eax), %xmm3	/* leftsur */
	mulps	%xmm5, %xmm0
	mulps	%xmm5, %xmm1

	movaps	3072(%eax), %xmm4	/* rightsur */

	mulps	%xmm7, %xmm3
	mulps	%xmm7, %xmm4
	addps	%xmm3, %xmm0
	addps	%xmm4, %xmm1

	movaps	%xmm0, (%eax)		/* left result */
	movaps	%xmm1, 1024(%eax)	/* right result */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldm_2f_2r_loop

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.size downmix_2f_2r_to_2ch_kni, .-downmix_2f_2r_to_2ch_kni
	.p2align 4,,7

/*
 * downmix_3f_1r_to_2ch_kni(samples, &dm_par)
 *
 * Channel offsets: left 0, center 1024, right 2048, sur 3072.
 *   left  = unit * left  + clev * center - slev * sur
 *   right = unit * right + clev * center + slev * sur
 */
	.global downmix_3f_1r_to_2ch_kni
	.type downmix_3f_1r_to_2ch_kni, @function

downmix_3f_1r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	8(%ebp), %eax		/* samples[] */
	movl	12(%ebp), %ebx		/* &dm_par */
	movl	$64, %ecx		/* loop counter */

	movss	(%ebx), %xmm5		/* unit */
	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */

	movss	4(%ebx), %xmm6		/* clev */
	shufps	$0, %xmm6, %xmm6	/* clev | clev | clev | clev */

	movss	8(%ebx), %xmm7		/* slev */
	shufps	$0, %xmm7, %xmm7	/* slev | slev | slev | slev */

.Ldm_3f_1r_loop:
	movaps	(%eax), %xmm0		/* left */
	movaps	2048(%eax), %xmm1	/* right */
	movaps	1024(%eax), %xmm2	/* center */
	mulps	%xmm5, %xmm0
	mulps	%xmm5, %xmm1

	mulps	%xmm6, %xmm2
	movaps	3072(%eax), %xmm3	/* sur */

	addps	%xmm2, %xmm0
	mulps	%xmm7, %xmm3

	addps	%xmm2, %xmm1

	subps	%xmm3, %xmm0		/* surround is subtracted on the left ... */
	addps	%xmm3, %xmm1		/* ... and added on the right */

	movaps	%xmm0, (%eax)		/* left result */
	movaps	%xmm1, 1024(%eax)	/* right result */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldm_3f_1r_loop

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.size downmix_3f_1r_to_2ch_kni, .-downmix_3f_1r_to_2ch_kni
	.p2align 4,,7

/*
 * downmix_2f_1r_to_2ch_kni(samples, &dm_par)
 *
 * Channel offsets: left 0, right 1024, sur 2048.
 *   left  = unit * left  - slev * sur
 *   right = unit * right + slev * sur
 */
	.global downmix_2f_1r_to_2ch_kni
	.type downmix_2f_1r_to_2ch_kni, @function

downmix_2f_1r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	8(%ebp), %eax		/* samples[] */
	movl	12(%ebp), %ebx		/* &dm_par */
	movl	$64, %ecx		/* loop counter */

	movss	(%ebx), %xmm5		/* unit */
	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */

	movss	8(%ebx), %xmm7		/* slev */
	shufps	$0, %xmm7, %xmm7	/* slev | slev | slev | slev */

.Ldm_2f_1r_loop:
	movaps	(%eax), %xmm0		/* left */
	movaps	1024(%eax), %xmm1	/* right */

	mulps	%xmm5, %xmm0
	mulps	%xmm5, %xmm1

	movaps	2048(%eax), %xmm3	/* sur */

	mulps	%xmm7, %xmm3

	subps	%xmm3, %xmm0		/* surround is subtracted on the left ... */
	addps	%xmm3, %xmm1		/* ... and added on the right */

	movaps	%xmm0, (%eax)		/* left result */
	movaps	%xmm1, 1024(%eax)	/* right result */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldm_2f_1r_loop

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.size downmix_2f_1r_to_2ch_kni, .-downmix_2f_1r_to_2ch_kni
	.p2align 4,,7

/*
 * downmix_3f_0r_to_2ch_kni(samples, &dm_par)
 *
 * Channel offsets: left 0, center 1024, right 2048.
 *   left  = unit * left  + clev * center
 *   right = unit * right + clev * center
 */
	.global downmix_3f_0r_to_2ch_kni
	.type downmix_3f_0r_to_2ch_kni, @function

downmix_3f_0r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	8(%ebp), %eax		/* samples[] */
	movl	12(%ebp), %ebx		/* &dm_par */
	movl	$64, %ecx		/* loop counter */

	movss	(%ebx), %xmm5		/* unit */
	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */

	movss	4(%ebx), %xmm6		/* clev */
	shufps	$0, %xmm6, %xmm6	/* clev | clev | clev | clev */

.Ldm_3f_0r_loop:
	movaps	(%eax), %xmm0		/* left */
	movaps	2048(%eax), %xmm1	/* right */
	movaps	1024(%eax), %xmm2	/* center */
	mulps	%xmm5, %xmm0
	mulps	%xmm5, %xmm1

	mulps	%xmm6, %xmm2

	addps	%xmm2, %xmm0

	addps	%xmm2, %xmm1

	movaps	%xmm0, (%eax)		/* left result */
	movaps	%xmm1, 1024(%eax)	/* right result */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldm_3f_0r_loop

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.size downmix_3f_0r_to_2ch_kni, .-downmix_3f_0r_to_2ch_kni
	.p2align 4,,7

/*
 * stream_sample_2ch_to_s16_kni(s16_samples, left, right)
 *
 * Converts 256 left and 256 right floats into 512 interleaved signed
 * 16-bit samples (l0 r0 l1 r1 ...).  The float -> int32 conversion is done
 * by cvtps2pi (rounding follows the current MXCSR rounding mode) and
 * packssdw saturates the result to the int16 range.
 * left and right must be 16-byte aligned (movaps); the output is written
 * with movq and has no alignment requirement here.
 */
	.global stream_sample_2ch_to_s16_kni
	.type stream_sample_2ch_to_s16_kni, @function

stream_sample_2ch_to_s16_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%edx
	pushl	%ecx

	movl	8(%ebp), %eax		/* s16_samples */
	movl	12(%ebp), %ebx		/* left */
	movl	16(%ebp), %edx		/* right */
	movl	$64, %ecx		/* loop counter */

.Lstereo_s16_loop:
	movaps	(%ebx), %xmm0		/* l3 | l2 | l1 | l0 */
	movaps	(%edx), %xmm1		/* r3 | r2 | r1 | r0 */
	movhlps	%xmm0, %xmm2		/* l3 | l2 */
	movhlps	%xmm1, %xmm3		/* r3 | r2 */
	unpcklps %xmm1, %xmm0		/* r1 | l1 | r0 | l0 */
	unpcklps %xmm3, %xmm2		/* r3 | l3 | r2 | l2 */

	cvtps2pi %xmm0, %mm0		/* r0 l0 --> mm0, int_32 */
	movhlps	%xmm0, %xmm0
	cvtps2pi %xmm0, %mm1		/* r1 l1 --> mm1, int_32 */

	cvtps2pi %xmm2, %mm2		/* r2 l2 --> mm2, int_32 */
	movhlps	%xmm2, %xmm2
	cvtps2pi %xmm2, %mm3		/* r3 l3 --> mm3, int_32 */
	packssdw %mm1, %mm0		/* r1 l1 r0 l0 --> mm0, int_16 */
	packssdw %mm3, %mm2		/* r3 l3 r2 l2 --> mm2, int_16 */

	movq	%mm0, (%eax)
	movq	%mm2, 8(%eax)
	addl	$16, %eax
	addl	$16, %ebx
	addl	$16, %edx

	decl	%ecx
	jnz	.Lstereo_s16_loop

	popl	%ecx
	popl	%edx
	popl	%ebx
	popl	%eax

	emms				/* leave MMX state so x87 code can run */

	leave
	ret
	.size stream_sample_2ch_to_s16_kni, .-stream_sample_2ch_to_s16_kni
	.p2align 4,,7

/*
 * stream_sample_1ch_to_s16_kni(s16_samples, center)
 *
 * Scales 256 floats by the constant stored at sqrt2 (0.7071068), converts
 * them to saturated signed 16-bit samples and writes every sample twice,
 * producing 512 output samples (c0 c0 c1 c1 ...).
 * The input must be 16-byte aligned (movaps).
 */
	.global stream_sample_1ch_to_s16_kni
	.type stream_sample_1ch_to_s16_kni, @function

stream_sample_1ch_to_s16_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	$sqrt2, %eax
	movss	(%eax), %xmm7
	movl	8(%ebp), %eax		/* s16_samples */
	movl	12(%ebp), %ebx		/* input samples */
	shufps	$0, %xmm7, %xmm7	/* scale | scale | scale | scale */
	movl	$64, %ecx		/* loop counter */

.Lmono_s16_loop:
	movaps	(%ebx), %xmm0		/* c3 | c2 | c1 | c0 */
	mulps	%xmm7, %xmm0
	movhlps	%xmm0, %xmm2		/* c3 | c2 */

	cvtps2pi %xmm0, %mm0		/* c1 c0 --> mm0, int_32 */
	cvtps2pi %xmm2, %mm1		/* c3 c2 --> mm1, int_32 */

	/*
	 * packssdw of a register with itself yields c1 c0 c1 c0, which is
	 * not the sample-doubled order.  Interleaving the low words with
	 * themselves afterwards gives the intended c1 c1 c0 c0.
	 */
	packssdw %mm0, %mm0		/* c1 c0 c1 c0 --> mm0, int_16 */
	packssdw %mm1, %mm1		/* c3 c2 c3 c2 --> mm1, int_16 */
	punpcklwd %mm0, %mm0		/* c1 c1 c0 c0 */
	punpcklwd %mm1, %mm1		/* c3 c3 c2 c2 */

	movq	%mm0, (%eax)
	movq	%mm1, 8(%eax)
	addl	$16, %eax
	addl	$16, %ebx

	decl	%ecx
	jnz	.Lmono_s16_loop

	popl	%ecx
	popl	%ebx
	popl	%eax

	emms				/* leave MMX state so x87 code can run */
	leave
	ret
	.size stream_sample_1ch_to_s16_kni, .-stream_sample_1ch_to_s16_kni

#if defined(__linux__) && defined(__ELF__)
	/* Tell the linker this object does not need an executable stack. */
	.section .note.GNU-stack,"",@progbits
#endif

#endif