/* 
 *  downmix_kni.S
 *
 *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - October 2000
 *
 *
 *  downmix_kni.S is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  downmix_kni.S is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#ifdef __i386__

/*
 * Read-only constants.
 *
 * sqrt2 holds the single-precision value 0.7071068, i.e. approximately
 * 1/sqrt(2) (not sqrt(2), despite the label name).  It is loaded by
 * stream_sample_1ch_to_s16_kni and multiplied into every sample before
 * the float to int16 conversion.
 */
.section .rodata
	/* keep the float naturally aligned (4 bytes on x86 ELF targets) */
	.align 4
sqrt2:	.float 0f0.7071068
	/* zero-fill up to the next 32-byte (2^5) boundary */
	.p2align 5,0,
	
	.section .text

	.align 4
	.global downmix_3f_2r_to_2ch_kni
	.type downmix_3f_2r_to_2ch_kni, @function

/*
 * downmix_3f_2r_to_2ch_kni(samples, dm_par)
 *
 * In-place SSE downmix of five sample planes to two.
 *
 * Stack arguments:
 *    8(%ebp)  samples - base of the float sample planes.  Planes are read
 *                       at byte offsets 0 (left), 1024 (center),
 *                       2048 (right), 3072 (left surround) and
 *                       4096 (right surround).  All accesses use movaps,
 *                       so the base must be 16-byte aligned.
 *   12(%ebp)  dm_par  - three consecutive floats: unit (+0), clev (+4),
 *                       slev (+8).
 *
 * For each sample:
 *   out_left  = (unit*left  + clev*center) + slev*left_sur
 *   out_right = (unit*right + clev*center) + slev*right_sur
 * out_left replaces the plane at offset 0, out_right the plane at
 * offset 1024.  The loop runs 64 times over 4 floats, i.e. 256 samples
 * (1024 bytes) per plane.
 *
 * %eax, %ebx and %ecx are saved and restored; %xmm0-%xmm7 are overwritten.
 */
downmix_3f_2r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	$64, %ecx		/* ecx = remaining 4-sample groups */
	movl	12(%ebp), %ebx		/* ebx = dm_par */
	movl	8(%ebp), %eax		/* eax = cursor into the left plane */

	/* Broadcast each gain across all four lanes. */
	movss	8(%ebx), %xmm7
	movss	4(%ebx), %xmm6
	movss	(%ebx), %xmm5
	shufps	$0, %xmm7, %xmm7	/* xmm7 = slev x4 */
	shufps	$0, %xmm6, %xmm6	/* xmm6 = clev x4 */
	shufps	$0, %xmm5, %xmm5	/* xmm5 = unit x4 */

.Ldownmix_3f_2r_next:
	/* Load every input first: the store to 1024(%eax) below reuses */
	/* the center plane slot for the right output.                  */
	movaps	(%eax), %xmm0		/* left */
	movaps	1024(%eax), %xmm2	/* center */
	movaps	2048(%eax), %xmm1	/* right */
	movaps	3072(%eax), %xmm3	/* left surround */
	movaps	4096(%eax), %xmm4	/* right surround */

	/* Apply the gains. */
	mulps	%xmm5, %xmm0
	mulps	%xmm6, %xmm2
	mulps	%xmm5, %xmm1
	mulps	%xmm7, %xmm3
	mulps	%xmm7, %xmm4

	/* Accumulate: center first, then the matching surround. */
	addps	%xmm2, %xmm0
	addps	%xmm2, %xmm1
	addps	%xmm3, %xmm0
	addps	%xmm4, %xmm1

	movaps	%xmm0, (%eax)		/* new left */
	movaps	%xmm1, 1024(%eax)	/* new right */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldownmix_3f_2r_next

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.p2align 4,,7

	.global downmix_2f_2r_to_2ch_kni
	.type downmix_2f_2r_to_2ch_kni, @function

/*
 * downmix_2f_2r_to_2ch_kni(samples, dm_par)
 *
 * In-place SSE downmix of four sample planes to two.
 *
 * Stack arguments:
 *    8(%ebp)  samples - base of the float sample planes, read at byte
 *                       offsets 0 (left), 1024 (right),
 *                       2048 (left surround) and 3072 (right surround).
 *                       Accessed with movaps: must be 16-byte aligned.
 *   12(%ebp)  dm_par  - floats unit (+0) and slev (+8); the float at +4
 *                       is not read here.
 *
 * For each sample:
 *   out_left  = unit*left  + slev*left_sur
 *   out_right = unit*right + slev*right_sur
 * written back to offsets 0 and 1024.  64 iterations of 4 floats cover
 * 256 samples per plane.
 *
 * %eax, %ebx and %ecx are saved and restored; %xmm0, %xmm1, %xmm3,
 * %xmm4, %xmm5 and %xmm7 are overwritten.
 */
downmix_2f_2r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	$64, %ecx		/* ecx = remaining 4-sample groups */
	movl	12(%ebp), %ebx		/* ebx = dm_par */
	movl	8(%ebp), %eax		/* eax = cursor into the left plane */

	movss	8(%ebx), %xmm7
	movss	(%ebx), %xmm5
	shufps	$0, %xmm7, %xmm7	/* xmm7 = slev x4 */
	shufps	$0, %xmm5, %xmm5	/* xmm5 = unit x4 */

.Ldownmix_2f_2r_next:
	movaps	(%eax), %xmm0		/* left */
	movaps	1024(%eax), %xmm1	/* right */
	movaps	2048(%eax), %xmm3	/* left surround */
	movaps	3072(%eax), %xmm4	/* right surround */

	mulps	%xmm5, %xmm0
	mulps	%xmm5, %xmm1
	mulps	%xmm7, %xmm3
	mulps	%xmm7, %xmm4

	addps	%xmm3, %xmm0
	addps	%xmm4, %xmm1

	movaps	%xmm0, (%eax)		/* new left */
	movaps	%xmm1, 1024(%eax)	/* new right */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldownmix_2f_2r_next

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.p2align 4,,7
	
	.global downmix_3f_1r_to_2ch_kni
	.type downmix_3f_1r_to_2ch_kni, @function

/*
 * downmix_3f_1r_to_2ch_kni(samples, dm_par)
 *
 * In-place SSE downmix of four sample planes (three front, one
 * surround) to two.
 *
 * Stack arguments:
 *    8(%ebp)  samples - base of the float sample planes, read at byte
 *                       offsets 0 (left), 1024 (center), 2048 (right)
 *                       and 3072 (surround).  Accessed with movaps:
 *                       must be 16-byte aligned.
 *   12(%ebp)  dm_par  - three consecutive floats: unit (+0), clev (+4),
 *                       slev (+8).
 *
 * For each sample:
 *   out_left  = (unit*left  + clev*center) - slev*sur
 *   out_right = (unit*right + clev*center) + slev*sur
 * The surround term is subtracted on the left and added on the right.
 * Results go to offsets 0 and 1024.  64 iterations of 4 floats cover
 * 256 samples per plane.
 *
 * %eax, %ebx and %ecx are saved and restored; %xmm0-%xmm3 and
 * %xmm5-%xmm7 are overwritten.
 */
downmix_3f_1r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	$64, %ecx		/* ecx = remaining 4-sample groups */
	movl	12(%ebp), %ebx		/* ebx = dm_par */
	movl	8(%ebp), %eax		/* eax = cursor into the left plane */

	movss	8(%ebx), %xmm7
	movss	4(%ebx), %xmm6
	movss	(%ebx), %xmm5
	shufps	$0, %xmm7, %xmm7	/* xmm7 = slev x4 */
	shufps	$0, %xmm6, %xmm6	/* xmm6 = clev x4 */
	shufps	$0, %xmm5, %xmm5	/* xmm5 = unit x4 */

.Ldownmix_3f_1r_next:
	/* Read all inputs before the right output overwrites the */
	/* center slot at 1024(%eax).                             */
	movaps	(%eax), %xmm0		/* left */
	movaps	1024(%eax), %xmm2	/* center */
	movaps	2048(%eax), %xmm1	/* right */
	movaps	3072(%eax), %xmm3	/* surround */

	mulps	%xmm5, %xmm0
	mulps	%xmm6, %xmm2
	mulps	%xmm5, %xmm1
	mulps	%xmm7, %xmm3

	addps	%xmm2, %xmm0
	addps	%xmm2, %xmm1
	subps	%xmm3, %xmm0		/* left:  minus surround */
	addps	%xmm3, %xmm1		/* right: plus surround */

	movaps	%xmm0, (%eax)		/* new left */
	movaps	%xmm1, 1024(%eax)	/* new right */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldownmix_3f_1r_next

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.p2align 4,,7
		
	.global downmix_2f_1r_to_2ch_kni
	.type downmix_2f_1r_to_2ch_kni, @function

/*
 * downmix_2f_1r_to_2ch_kni(samples, dm_par)
 *
 * In-place SSE downmix of three sample planes (two front, one
 * surround) to two.
 *
 * Stack arguments:
 *    8(%ebp)  samples - base of the float sample planes, read at byte
 *                       offsets 0 (left), 1024 (right) and
 *                       2048 (surround).  Accessed with movaps: must be
 *                       16-byte aligned.
 *   12(%ebp)  dm_par  - floats unit (+0) and slev (+8); the float at +4
 *                       is not read here.
 *
 * For each sample:
 *   out_left  = unit*left  - slev*sur
 *   out_right = unit*right + slev*sur
 * written back to offsets 0 and 1024.  64 iterations of 4 floats cover
 * 256 samples per plane.
 *
 * %eax, %ebx and %ecx are saved and restored; %xmm0, %xmm1, %xmm3,
 * %xmm5 and %xmm7 are overwritten.
 */
downmix_2f_1r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	$64, %ecx		/* ecx = remaining 4-sample groups */
	movl	12(%ebp), %ebx		/* ebx = dm_par */
	movl	8(%ebp), %eax		/* eax = cursor into the left plane */

	movss	8(%ebx), %xmm7
	movss	(%ebx), %xmm5
	shufps	$0, %xmm7, %xmm7	/* xmm7 = slev x4 */
	shufps	$0, %xmm5, %xmm5	/* xmm5 = unit x4 */

.Ldownmix_2f_1r_next:
	movaps	(%eax), %xmm0		/* left */
	movaps	1024(%eax), %xmm1	/* right */
	movaps	2048(%eax), %xmm3	/* surround */

	mulps	%xmm5, %xmm0
	mulps	%xmm5, %xmm1
	mulps	%xmm7, %xmm3

	subps	%xmm3, %xmm0		/* left:  minus surround */
	addps	%xmm3, %xmm1		/* right: plus surround */

	movaps	%xmm0, (%eax)		/* new left */
	movaps	%xmm1, 1024(%eax)	/* new right */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldownmix_2f_1r_next

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.p2align 4,,7
	
	.global downmix_3f_0r_to_2ch_kni
	.type downmix_3f_0r_to_2ch_kni, @function

/*
 * downmix_3f_0r_to_2ch_kni(samples, dm_par)
 *
 * In-place SSE downmix of three front sample planes to two.
 *
 * Stack arguments:
 *    8(%ebp)  samples - base of the float sample planes, read at byte
 *                       offsets 0 (left), 1024 (center) and
 *                       2048 (right).  Accessed with movaps: must be
 *                       16-byte aligned.
 *   12(%ebp)  dm_par  - floats unit (+0) and clev (+4); the float at +8
 *                       is not read here.
 *
 * For each sample:
 *   out_left  = unit*left  + clev*center
 *   out_right = unit*right + clev*center
 * written back to offsets 0 and 1024.  64 iterations of 4 floats cover
 * 256 samples per plane.
 *
 * %eax, %ebx and %ecx are saved and restored; %xmm0, %xmm1, %xmm2,
 * %xmm5 and %xmm6 are overwritten.
 */
downmix_3f_0r_to_2ch_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movl	$64, %ecx		/* ecx = remaining 4-sample groups */
	movl	12(%ebp), %ebx		/* ebx = dm_par */
	movl	8(%ebp), %eax		/* eax = cursor into the left plane */

	movss	4(%ebx), %xmm6
	movss	(%ebx), %xmm5
	shufps	$0, %xmm6, %xmm6	/* xmm6 = clev x4 */
	shufps	$0, %xmm5, %xmm5	/* xmm5 = unit x4 */

.Ldownmix_3f_0r_next:
	/* Center is loaded before the right output reuses its slot. */
	movaps	(%eax), %xmm0		/* left */
	movaps	1024(%eax), %xmm2	/* center */
	movaps	2048(%eax), %xmm1	/* right */

	mulps	%xmm5, %xmm0
	mulps	%xmm6, %xmm2
	mulps	%xmm5, %xmm1

	addps	%xmm2, %xmm0
	addps	%xmm2, %xmm1

	movaps	%xmm0, (%eax)		/* new left */
	movaps	%xmm1, 1024(%eax)	/* new right */

	addl	$16, %eax
	decl	%ecx
	jnz	.Ldownmix_3f_0r_next

	popl	%ecx
	popl	%ebx
	popl	%eax

	leave
	ret
	.p2align 4,,7
	
	.global stream_sample_2ch_to_s16_kni
	.type stream_sample_2ch_to_s16_kni, @function

/*
 * stream_sample_2ch_to_s16_kni(s16_samples, left, right)
 *
 * Converts two float planes into one interleaved stream of signed
 * 16-bit samples.
 *
 * Stack arguments:
 *    8(%ebp)  s16_samples - output; 16 bytes are written per iteration
 *                           with movq (l0 r0 l1 r1 l2 r2 l3 r3).
 *   12(%ebp)  left        - input floats, read with movaps (16-byte
 *                           aligned).
 *   16(%ebp)  right       - input floats, read with movaps (16-byte
 *                           aligned).
 *
 * 64 iterations of 4 samples per channel = 256 samples per channel.
 * cvtps2pi converts using the current MXCSR rounding mode and packssdw
 * saturates the 32-bit results to int16.  MMX state is cleared with
 * emms before returning.
 *
 * %eax, %ebx, %ecx and %edx are saved and restored; %xmm0-%xmm3 and
 * %mm0-%mm3 are overwritten.
 */
stream_sample_2ch_to_s16_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%edx
	pushl	%ecx

	movl	$64, %ecx		/* ecx = remaining 4-sample groups */
	movl	16(%ebp), %edx		/* edx = right input cursor */
	movl	12(%ebp), %ebx		/* ebx = left input cursor */
	movl	8(%ebp), %eax		/* eax = output cursor */

.Lstream_2ch_next:
	movaps	(%edx), %xmm1		/* r3 | r2 | r1 | r0 */
	movaps	(%ebx), %xmm0		/* l3 | l2 | l1 | l0 */

	/* Copy the upper pairs aside, then interleave left with right. */
	movhlps	%xmm1, %xmm3		/* low half = r3 | r2 */
	movhlps	%xmm0, %xmm2		/* low half = l3 | l2 */
	unpcklps %xmm3, %xmm2		/* r3 | l3 | r2 | l2 */
	unpcklps %xmm1, %xmm0		/* r1 | l1 | r0 | l0 */

	/* cvtps2pi only converts the low two lanes, so shift the high */
	/* pair down for the second conversion of each register.       */
	cvtps2pi %xmm0, %mm0		/* r0 l0 as int32 */
	cvtps2pi %xmm2, %mm2		/* r2 l2 as int32 */
	movhlps	%xmm0, %xmm0
	movhlps	%xmm2, %xmm2
	cvtps2pi %xmm0, %mm1		/* r1 l1 as int32 */
	cvtps2pi %xmm2, %mm3		/* r3 l3 as int32 */

	packssdw %mm1, %mm0		/* r1 l1 r0 l0 as int16 */
	packssdw %mm3, %mm2		/* r3 l3 r2 l2 as int16 */

	movq	%mm0, (%eax)
	movq	%mm2, 8(%eax)

	addl	$16, %eax
	addl	$16, %ebx
	addl	$16, %edx

	decl	%ecx
	jnz	.Lstream_2ch_next

	emms				/* leave the x87/MMX state clean */

	popl	%ecx
	popl	%edx
	popl	%ebx
	popl	%eax

	leave
	ret
	.p2align 4,,7
	
	.global stream_sample_1ch_to_s16_kni
	.type stream_sample_1ch_to_s16_kni, @function

/*
 * stream_sample_1ch_to_s16_kni(s16_samples, center)
 *
 * Converts one float plane into a stream of signed 16-bit samples in
 * which every converted sample is written twice in succession
 * (c0 c0 c1 c1 c2 c2 c3 c3 per iteration) - presumably the left and
 * right slots of an interleaved stereo stream.
 *
 * Stack arguments:
 *    8(%ebp)  s16_samples - output; 16 bytes are written per iteration
 *                           with movq.
 *   12(%ebp)  center      - input floats, read with movaps (16-byte
 *                           aligned).
 *
 * Every input is first scaled by the constant stored at sqrt2
 * (0.7071068).  cvtps2pi converts using the current MXCSR rounding
 * mode and packssdw saturates to int16.  64 iterations of 4 inputs
 * cover 256 samples.  MMX state is cleared with emms before returning.
 *
 * Fix relative to the earlier version: packssdw with identical source
 * and destination yields the word order c1 c0 c1 c0, which stored the
 * pairs (c0,c1),(c0,c1) instead of (c0,c0),(c1,c1).  A punpcklwd of the
 * register with itself now duplicates each word in place.
 *
 * %eax, %ebx and %ecx are saved and restored; %xmm0, %xmm2, %xmm7,
 * %mm0 and %mm1 are overwritten.
 */
stream_sample_1ch_to_s16_kni:
	pushl	%ebp
	movl	%esp, %ebp

	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	movss	sqrt2, %xmm7
	shufps	$0, %xmm7, %xmm7	/* xmm7 = gain x4 */

	movl	8(%ebp), %eax		/* eax = output cursor */
	movl	12(%ebp), %ebx		/* ebx = input cursor */
	movl	$64, %ecx		/* ecx = remaining 4-sample groups */

.Lstream_1ch_next:
	movaps	(%ebx), %xmm0		/* c3 | c2 | c1 | c0 */
	mulps	%xmm7, %xmm0
	movhlps	%xmm0, %xmm2		/* low half = c3 | c2 */

	cvtps2pi %xmm0, %mm0		/* c1 c0 as int32 */
	cvtps2pi %xmm2, %mm1		/* c3 c2 as int32 */

	packssdw %mm0, %mm0		/* c1 c0 c1 c0 as int16 */
	packssdw %mm1, %mm1		/* c3 c2 c3 c2 as int16 */

	/* Duplicate each of the two low words: w1 w1 w0 w0. */
	punpcklwd %mm0, %mm0		/* c1 c1 c0 c0 */
	punpcklwd %mm1, %mm1		/* c3 c3 c2 c2 */

	movq	%mm0, (%eax)
	movq	%mm1, 8(%eax)

	addl	$16, %eax
	addl	$16, %ebx

	decl	%ecx
	jnz	.Lstream_1ch_next

	popl	%ecx
	popl	%ebx
	popl	%eax

	emms				/* leave the x87/MMX state clean */
	leave
	ret
	.size stream_sample_1ch_to_s16_kni, .-stream_sample_1ch_to_s16_kni
#endif