Regenerated optimized x86 basic ops with latest GCC 4.4.0 snapshot

2009-03-04 12:10:02 +01:00
parent ca8439301b
commit 18bbed4fd6
3 changed files with 383 additions and 342 deletions
--- a/src/core/basic_ops_x86_mmx.s
+++ b/src/core/basic_ops_x86_mmx.s
@@ -10,11 +10,11 @@ alignedMemCpyMMX:
 	movl	124(%esp), %eax
 	shrl	$6, %ebx
 #APP
-# 42 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+# 42 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
 	 fsave 4(%esp); fwait

 # 0 "" 2
-# 44 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+# 44 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
 	1: prefetchnta (%eax)
   prefetchnta 64(%eax)
   prefetchnta 128(%eax)
@@ -31,7 +31,7 @@ alignedMemCpyMMX:
 	.p2align 3
 .L3:
 #APP
-# 53 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+# 53 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
 	1: prefetchnta 320(%eax)
 2: movq (%eax), %mm0
   movq 8(%eax), %mm1
@@ -59,7 +59,7 @@ alignedMemCpyMMX:
 	jne	.L3
 .L2:
 #APP
-# 75 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+# 75 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
 	 fsave 4(%esp); fwait

 # 0 "" 2
@@ -83,7 +83,7 @@ alignedMemClearMMX:
 	.p2align 3
 .L9:
 #APP
-# 90 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+# 90 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
 	movq    %mm0, (%edx)
 movq    %mm0, 8(%edx)
 movq    %mm0, 16(%edx)
@@ -103,5 +103,5 @@ movq    %mm0, 56(%edx)
 	emms
 	ret
 	.size	alignedMemClearMMX, .-alignedMemClearMMX
-	.ident	"GCC: (GNU) 4.4.0 20081204 (experimental)"
+	.ident	"GCC: (GNU) 4.4.0 20090304 (experimental)"
 	.section	.note.GNU-stack,"",@progbits
--- a/src/core/basic_ops_x86_sse.s
+++ b/src/core/basic_ops_x86_sse.s
@@ -77,16 +77,16 @@ alignedBufApplyGainSSE:
 	.p2align 4,,7
 	.p2align 3
 .L14:
-	movaps	%xmm0, %xmm3
+	movaps	16(%eax), %xmm3
 	addl	$1, %edx
-	movaps	%xmm0, %xmm2
-	movaps	%xmm0, %xmm1
-	movaps	%xmm0, %xmm4
-	mulps	16(%eax), %xmm3
-	mulps	32(%eax), %xmm2
-	mulps	48(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	mulps	%xmm0, %xmm3
+	movaps	48(%eax), %xmm1
+	mulps	%xmm0, %xmm2
+	movaps	(%eax), %xmm4
+	mulps	%xmm0, %xmm1
 	movaps	%xmm3, 16(%eax)
-	mulps	(%eax), %xmm4
+	mulps	%xmm0, %xmm4
 	movaps	%xmm2, 32(%eax)
 	movaps	%xmm1, 48(%eax)
 	movaps	%xmm4, (%eax)
@@ -148,28 +148,28 @@ alignedBufMixLRCoeffSSE:
 	movl	16(%esp), %ebx
 	testl	%esi, %esi
 	jle	.L25
-	movss	24(%esp), %xmm0
+	movss	24(%esp), %xmm2
 	subl	$1, %esi
-	movss	20(%esp), %xmm1
+	movss	20(%esp), %xmm0
 	xorl	%eax, %eax
 	shrl	$2, %esi
 	xorl	%ecx, %ecx
 	addl	$1, %esi
-	unpcklps	%xmm0, %xmm1
-	movaps	%xmm1, %xmm0
-	movlhps	%xmm1, %xmm0
+	unpcklps	%xmm2, %xmm0
+	movaps	%xmm0, %xmm2
+	movlhps	%xmm0, %xmm2
 	.p2align 4,,7
 	.p2align 3
 .L24:
-	movaps	%xmm0, %xmm1
+	movaps	16(%ebx,%eax), %xmm0
 	addl	$1, %ecx
-	movaps	%xmm0, %xmm2
-	mulps	16(%ebx,%eax), %xmm1
-	mulps	(%ebx,%eax), %xmm2
-	addps	16(%edx,%eax), %xmm1
-	addps	(%edx,%eax), %xmm2
-	movaps	%xmm1, 16(%edx,%eax)
-	movaps	%xmm2, (%edx,%eax)
+	movaps	(%ebx,%eax), %xmm1
+	mulps	%xmm2, %xmm0
+	mulps	%xmm2, %xmm1
+	addps	16(%edx,%eax), %xmm0
+	addps	(%edx,%eax), %xmm1
+	movaps	%xmm0, 16(%edx,%eax)
+	movaps	%xmm1, (%edx,%eax)
 	addl	$32, %eax
 	cmpl	%ecx, %esi
 	ja	.L24
@@ -189,31 +189,31 @@ alignedBufWetDryMixSSE:
 	movl	16(%esp), %ebx
 	testl	%esi, %esi
 	jle	.L30
-	movss	24(%esp), %xmm1
+	movss	24(%esp), %xmm3
 	subl	$1, %esi
-	movss	20(%esp), %xmm0
+	movss	20(%esp), %xmm2
 	xorl	%eax, %eax
 	shrl	$2, %esi
 	xorl	%ecx, %ecx
-	shufps	$0, %xmm1, %xmm1
+	shufps	$0, %xmm3, %xmm3
 	addl	$1, %esi
-	shufps	$0, %xmm0, %xmm0
+	shufps	$0, %xmm2, %xmm2
 	.p2align 4,,7
 	.p2align 3
 .L29:
-	movaps	%xmm1, %xmm3
+	movaps	16(%ebx,%eax), %xmm1
 	addl	$1, %ecx
-	movaps	%xmm0, %xmm2
-	movaps	%xmm1, %xmm4
-	mulps	16(%edx,%eax), %xmm3
-	mulps	16(%ebx,%eax), %xmm2
-	mulps	(%edx,%eax), %xmm4
-	addps	%xmm3, %xmm2
-	movaps	%xmm0, %xmm3
-	mulps	(%ebx,%eax), %xmm3
-	movaps	%xmm2, 16(%edx,%eax)
-	addps	%xmm4, %xmm3
-	movaps	%xmm3, (%edx,%eax)
+	movaps	16(%edx,%eax), %xmm0
+	mulps	%xmm2, %xmm1
+	movaps	(%ebx,%eax), %xmm4
+	mulps	%xmm3, %xmm0
+	mulps	%xmm2, %xmm4
+	addps	%xmm1, %xmm0
+	movaps	(%edx,%eax), %xmm1
+	mulps	%xmm3, %xmm1
+	movaps	%xmm0, 16(%edx,%eax)
+	addps	%xmm4, %xmm1
+	movaps	%xmm1, (%edx,%eax)
 	addl	$32, %eax
 	cmpl	%ecx, %esi
 	ja	.L29
@@ -230,131 +230,129 @@ alignedBufWetDryMixSplittedSSE:
 	pushl	%edi
 	pushl	%esi
 	pushl	%ebx
-	subl	$124, %esp
-	movl	164(%esp), %eax
-	movl	144(%esp), %edx
-	movl	148(%esp), %esi
-	movl	152(%esp), %ecx
+	subl	$140, %esp
+	movl	180(%esp), %eax
+	flds	172(%esp)
+	movl	160(%esp), %edx
+	movl	164(%esp), %esi
 	testl	%eax, %eax
-	movss	156(%esp), %xmm4
-	movss	160(%esp), %xmm5
-	jle	.L39
-	movl	164(%esp), %eax
+	movl	168(%esp), %ecx
+	flds	176(%esp)
+	jle	.L43
+	movl	180(%esp), %eax
 	subl	$1, %eax
 	shrl	%eax
 	addl	$1, %eax
 	movl	%eax, %ebp
-	movl	%eax, 96(%esp)
+	movl	%eax, 120(%esp)
 	shrl	$2, %ebp
-	cmpl	$3, 96(%esp)
+	cmpl	$3, 120(%esp)
 	leal	0(,%ebp,4), %eax
-	movl	%eax, 100(%esp)
+	movl	%eax, 124(%esp)
 	jbe	.L40
 	testl	%eax, %eax
 	jne	.L34
 .L40:
+	fxch	%st(1)
 	xorl	%edi, %edi
 	jmp	.L36
 	.p2align 4,,7
 	.p2align 3
 .L34:
-	movaps	%xmm4, %xmm2
-	xorps	%xmm6, %xmm6
-	shufps	$0, %xmm2, %xmm2
-	movaps	%xmm5, %xmm1
+	fsts	12(%esp)
+	fxch	%st(1)
+	xorps	%xmm7, %xmm7
+	movss	12(%esp), %xmm0
 	movl	%esi, %ebx
-	shufps	$0, %xmm1, %xmm1
-	movaps	%xmm2, (%esp)
+	fsts	12(%esp)
 	xorl	%eax, %eax
 	xorl	%edi, %edi
-	movss	%xmm5, 108(%esp)
-	movss	%xmm4, 104(%esp)
-	movaps	%xmm1, %xmm4
+	shufps	$0, %xmm0, %xmm0
+	movaps	%xmm0, 32(%esp)
+	movss	12(%esp), %xmm0
+	shufps	$0, %xmm0, %xmm0
+	movaps	%xmm0, 16(%esp)
 	.p2align 4,,7
 	.p2align 3
 .L37:
-	movaps	16(%edx,%eax,2), %xmm3
+	movaps	(%edx,%eax,2), %xmm5
 	addl	$1, %edi
-	movaps	(%edx,%eax,2), %xmm2
-	movaps	32(%edx,%eax,2), %xmm1
-	movaps	%xmm2, %xmm7
-	shufps	$221, %xmm3, %xmm2
-	movaps	48(%edx,%eax,2), %xmm0
-	shufps	$136, %xmm3, %xmm7
-	movaps	%xmm2, 64(%esp)
-	movaps	%xmm1, %xmm2
-	shufps	$221, %xmm0, %xmm1
-	shufps	$136, %xmm0, %xmm2
-	movaps	%xmm6, %xmm3
-	movaps	%xmm2, 48(%esp)
-	movlps	(%ebx), %xmm3
-	movhps	8(%ebx), %xmm3
-	movaps	%xmm7, %xmm5
-	movaps	%xmm3, %xmm0
-	movaps	%xmm6, %xmm2
-	movlps	16(%ebx), %xmm2
-	shufps	$136, 48(%esp), %xmm5
-	movhps	24(%ebx), %xmm2
-	shufps	$136, %xmm2, %xmm0
-	addl	$32, %ebx
-	mulps	%xmm4, %xmm5
-	shufps	$221, %xmm2, %xmm3
-	movaps	%xmm1, 32(%esp)
-	mulps	(%esp), %xmm0
-	movaps	%xmm6, %xmm1
-	shufps	$221, 48(%esp), %xmm7
-	movlps	(%ecx,%eax), %xmm1
-	movhps	8(%ecx,%eax), %xmm1
-	movaps	64(%esp), %xmm2
-	mulps	%xmm4, %xmm7
-	addps	%xmm0, %xmm5
-	movaps	%xmm6, %xmm0
-	movlps	16(%ecx,%eax), %xmm0
-	movhps	24(%ecx,%eax), %xmm0
-	shufps	$221, 32(%esp), %xmm2
-	movaps	%xmm5, 16(%esp)
-	movaps	64(%esp), %xmm5
-	mulps	%xmm4, %xmm2
-	shufps	$136, 32(%esp), %xmm5
-	mulps	(%esp), %xmm3
-	mulps	%xmm4, %xmm5
-	addps	%xmm3, %xmm7
-	movaps	16(%esp), %xmm3
-	movaps	%xmm5, 80(%esp)
-	movaps	%xmm1, %xmm5
-	shufps	$221, %xmm0, %xmm1
-	shufps	$136, %xmm0, %xmm5
-	mulps	(%esp), %xmm1
-	unpcklps	%xmm7, %xmm3
-	mulps	(%esp), %xmm5
-	movaps	16(%esp), %xmm0
-	addps	%xmm1, %xmm2
-	movaps	%xmm3, %xmm1
-	addps	80(%esp), %xmm5
-	unpckhps	%xmm7, %xmm0
-	movaps	%xmm0, %xmm7
+	movaps	16(%edx,%eax,2), %xmm6
 	movaps	%xmm5, %xmm0
-	unpcklps	%xmm2, %xmm0
-	unpckhps	%xmm2, %xmm5
-	unpcklps	%xmm0, %xmm1
-	unpckhps	%xmm0, %xmm3
-	movaps	%xmm7, %xmm0
-	unpckhps	%xmm5, %xmm7
-	unpcklps	%xmm5, %xmm0
+	shufps	$136, %xmm6, %xmm0
+	movaps	32(%edx,%eax,2), %xmm4
+	shufps	$221, %xmm6, %xmm5
+	movaps	%xmm0, 96(%esp)
+	movaps	48(%edx,%eax,2), %xmm3
+	movaps	%xmm4, %xmm0
+	shufps	$136, %xmm3, %xmm0
+	movaps	96(%esp), %xmm2
+	shufps	$221, %xmm3, %xmm4
+	movaps	%xmm7, %xmm6
+	movlps	(%ebx), %xmm6
+	movaps	%xmm5, 80(%esp)
+	movhps	8(%ebx), %xmm6
+	shufps	$136, %xmm0, %xmm2
+	movaps	%xmm0, 64(%esp)
+	movaps	%xmm7, %xmm5
+	movaps	%xmm6, %xmm0
+	movlps	16(%ebx), %xmm5
+	movhps	24(%ebx), %xmm5
+	shufps	$136, %xmm5, %xmm0
+	mulps	32(%esp), %xmm2
+	shufps	$221, %xmm5, %xmm6
+	movaps	%xmm4, 48(%esp)
+	addl	$32, %ebx
+	mulps	16(%esp), %xmm0
+	movaps	%xmm7, %xmm4
+	movlps	(%eax,%ecx), %xmm4
+	movaps	%xmm7, %xmm3
+	movhps	8(%eax,%ecx), %xmm4
+	movaps	%xmm4, %xmm1
+	movlps	16(%ecx,%eax), %xmm3
+	movhps	24(%ecx,%eax), %xmm3
+	shufps	$136, %xmm3, %xmm1
+	addps	%xmm0, %xmm2
+	movaps	80(%esp), %xmm0
+	shufps	$221, %xmm3, %xmm4
+	shufps	$136, 48(%esp), %xmm0
+	mulps	16(%esp), %xmm1
+	movaps	%xmm2, %xmm3
+	movaps	80(%esp), %xmm5
+	mulps	32(%esp), %xmm0
+	shufps	$221, 48(%esp), %xmm5
+	mulps	16(%esp), %xmm6
+	addps	%xmm1, %xmm0
+	movaps	96(%esp), %xmm1
+	shufps	$221, 64(%esp), %xmm1
+	mulps	16(%esp), %xmm4
+	mulps	32(%esp), %xmm1
+	mulps	32(%esp), %xmm5
+	addps	%xmm6, %xmm1
+	addps	%xmm4, %xmm5
+	movaps	%xmm0, %xmm4
+	unpcklps	%xmm1, %xmm3
+	unpcklps	%xmm5, %xmm4
+	unpckhps	%xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	unpckhps	%xmm5, %xmm0
+	unpcklps	%xmm4, %xmm1
+	unpckhps	%xmm4, %xmm3
 	movaps	%xmm1, (%edx,%eax,2)
+	movaps	%xmm2, %xmm1
+	unpckhps	%xmm0, %xmm2
+	unpcklps	%xmm0, %xmm1
 	movaps	%xmm3, 16(%edx,%eax,2)
-	movaps	%xmm0, 32(%edx,%eax,2)
-	movaps	%xmm7, 48(%edx,%eax,2)
+	movaps	%xmm1, 32(%edx,%eax,2)
+	movaps	%xmm2, 48(%edx,%eax,2)
 	addl	$32, %eax
 	cmpl	%edi, %ebp
 	ja	.L37
-	movl	100(%esp), %edi
-	movl	96(%esp), %eax
-	movss	104(%esp), %xmm4
-	movss	108(%esp), %xmm5
+	movl	124(%esp), %edi
+	movl	120(%esp), %eax
 	addl	%edi, %edi
-	cmpl	%eax, 100(%esp)
-	je	.L39
+	cmpl	%eax, 124(%esp)
+	je	.L44
 .L36:
 	leal	(%edx,%edi,8), %ebx
 	xorl	%ebp, %ebp
@@ -363,38 +361,50 @@ alignedBufWetDryMixSplittedSSE:
 	.p2align 4,,7
 	.p2align 3
 .L38:
-	movaps	%xmm5, %xmm1
+	flds	(%ebx)
 	addl	$2, %ebp
-	movaps	%xmm4, %xmm0
-	mulss	(%ebx), %xmm1
-	mulss	(%esi,%eax,4), %xmm0
-	addss	%xmm1, %xmm0
-	movaps	%xmm5, %xmm1
-	movss	%xmm0, (%ebx)
-	movaps	%xmm4, %xmm0
-	mulss	4(%ebx), %xmm1
-	mulss	(%ecx,%eax,4), %xmm0
-	addss	%xmm1, %xmm0
-	movaps	%xmm5, %xmm1
-	movss	%xmm0, 4(%ebx)
+	fmul	%st(2), %st
+	flds	(%esi,%eax,4)
+	fmul	%st(2), %st
+	faddp	%st, %st(1)
+	fstps	(%ebx)
+	flds	4(%ebx)
+	fmul	%st(2), %st
+	flds	(%ecx,%eax,4)
+	fmul	%st(2), %st
+	faddp	%st, %st(1)
+	fstps	4(%ebx)
 	addl	$16, %ebx
-	movaps	%xmm4, %xmm0
-	mulss	(%edx), %xmm1
-	mulss	4(%esi,%eax,4), %xmm0
-	addss	%xmm1, %xmm0
-	movaps	%xmm5, %xmm1
-	movss	%xmm0, (%edx)
-	movaps	%xmm4, %xmm0
-	mulss	4(%edx), %xmm1
-	mulss	4(%ecx,%eax,4), %xmm0
+	flds	(%edx)
+	fmul	%st(2), %st
+	flds	4(%esi,%eax,4)
+	fmul	%st(2), %st
+	faddp	%st, %st(1)
+	fstps	(%edx)
+	flds	4(%edx)
+	fmul	%st(2), %st
+	flds	4(%ecx,%eax,4)
 	leal	(%edi,%ebp), %eax
-	addss	%xmm1, %xmm0
-	movss	%xmm0, 4(%edx)
+	fmul	%st(2), %st
+	faddp	%st, %st(1)
+	fstps	4(%edx)
 	addl	$16, %edx
-	cmpl	%eax, 164(%esp)
+	cmpl	%eax, 180(%esp)
 	jg	.L38
+	fstp	%st(0)
+	fstp	%st(0)
+	jmp	.L39
+.L43:
+	fstp	%st(0)
+	fstp	%st(0)
+	jmp	.L39
+.L44:
+	fstp	%st(0)
+	fstp	%st(0)
+	.p2align 4,,7
+	.p2align 3
 .L39:
-	addl	$124, %esp
+	addl	$140, %esp
 	popl	%ebx
 	popl	%esi
 	popl	%edi
@@ -407,34 +417,39 @@ alignedBufWetDryMixSplittedSSE:
 unalignedBufMixLRCoeffSSE:
 	pushl	%esi
 	pushl	%ebx
-	movl	28(%esp), %esi
-	movl	12(%esp), %eax
-	movl	16(%esp), %edx
-	movss	20(%esp), %xmm0
-	movl	%esi, %ecx
-	shrl	$31, %ecx
-	leal	(%esi,%ecx), %ebx
-	andl	$1, %ebx
-	cmpl	%ecx, %ebx
-	movss	24(%esp), %xmm3
-	jne	.L52
-.L44:
+	subl	$4, %esp
+	movl	32(%esp), %esi
+	flds	24(%esp)
+	movl	16(%esp), %eax
+	movl	20(%esp), %edx
+	movl	%esi, %ebx
+	flds	28(%esp)
+	shrl	$31, %ebx
+	leal	(%esi,%ebx), %ecx
+	andl	$1, %ecx
+	cmpl	%ebx, %ecx
+	jne	.L54
+.L46:
 	testl	%esi, %esi
-	jle	.L49
+	jle	.L55
 	leal	-1(%esi), %ebx
 	shrl	%ebx
 	testb	$15, %al
-	jne	.L46
-	movaps	%xmm0, %xmm1
+	jne	.L48
+	fxch	%st(1)
+	fstps	(%esp)
 	xorps	%xmm2, %xmm2
-	unpcklps	%xmm3, %xmm1
+	movss	(%esp), %xmm0
 	addl	$1, %ebx
+	fstps	(%esp)
 	xorl	%ecx, %ecx
-	movaps	%xmm1, %xmm3
-	movlhps	%xmm1, %xmm3
+	movss	(%esp), %xmm1
+	unpcklps	%xmm1, %xmm0
+	movaps	%xmm0, %xmm3
+	movlhps	%xmm0, %xmm3
 	.p2align 4,,7
 	.p2align 3
-.L47:
+.L49:
 	movaps	%xmm2, %xmm1
 	addl	$1, %ecx
 	movlps	(%edx), %xmm1
@@ -448,55 +463,65 @@ unalignedBufMixLRCoeffSSE:
 	movaps	%xmm0, (%eax)
 	addl	$16, %eax
 	cmpl	%ebx, %ecx
-	jb	.L47
-.L49:
+	jb	.L49
+	jmp	.L51
+	.p2align 4,,7
+	.p2align 3
+.L55:
+	fstp	%st(0)
+	fstp	%st(0)
+	.p2align 4,,7
+	.p2align 3
+.L51:
+	addl	$4, %esp
 	popl	%ebx
 	popl	%esi
 	ret
 	.p2align 4,,7
 	.p2align 3
-.L46:
-	xorl	%ecx, %ecx
-	.p2align 4,,7
-	.p2align 3
 .L48:
-	movaps	%xmm0, %xmm1
-	mulss	(%edx,%ecx,8), %xmm1
-	addss	(%eax,%ecx,8), %xmm1
-	movss	%xmm1, (%eax,%ecx,8)
-	movaps	%xmm3, %xmm1
-	mulss	4(%edx,%ecx,8), %xmm1
-	addss	4(%eax,%ecx,8), %xmm1
-	movss	%xmm1, 4(%eax,%ecx,8)
-	movaps	%xmm0, %xmm1
-	mulss	8(%edx,%ecx,8), %xmm1
-	addss	8(%eax,%ecx,8), %xmm1
-	movss	%xmm1, 8(%eax,%ecx,8)
-	movaps	%xmm3, %xmm1
-	mulss	12(%edx,%ecx,8), %xmm1
-	addss	12(%eax,%ecx,8), %xmm1
-	movss	%xmm1, 12(%eax,%ecx,8)
+	xorl	%ecx, %ecx
+	.p2align 4,,7
+	.p2align 3
+.L50:
+	flds	(%edx,%ecx,8)
+	fmul	%st(2), %st
+	fadds	(%eax,%ecx,8)
+	fstps	(%eax,%ecx,8)
+	flds	4(%edx,%ecx,8)
+	fmul	%st(1), %st
+	fadds	4(%eax,%ecx,8)
+	fstps	4(%eax,%ecx,8)
+	flds	8(%edx,%ecx,8)
+	fmul	%st(2), %st
+	fadds	8(%eax,%ecx,8)
+	fstps	8(%eax,%ecx,8)
+	flds	12(%edx,%ecx,8)
+	fmul	%st(1), %st
+	fadds	12(%eax,%ecx,8)
+	fstps	12(%eax,%ecx,8)
 	addl	$2, %ecx
 	cmpl	%ecx, %esi
-	jg	.L48
+	jg	.L50
+	fstp	%st(0)
+	fstp	%st(0)
+	addl	$4, %esp
 	popl	%ebx
 	popl	%esi
 	ret
-.L52:
-	movaps	%xmm0, %xmm1
+.L54:
+	flds	(%edx)
 	subl	$1, %esi
-	movss	(%eax), %xmm2
-	mulss	(%edx), %xmm1
-	addss	%xmm2, %xmm1
-	movss	4(%eax), %xmm2
-	movss	%xmm1, (%eax)
-	movaps	%xmm3, %xmm1
-	mulss	4(%edx), %xmm1
+	fmul	%st(2), %st
+	fadds	(%eax)
+	fstps	(%eax)
+	flds	4(%edx)
 	addl	$8, %edx
-	addss	%xmm2, %xmm1
-	movss	%xmm1, 4(%eax)
+	fmul	%st(1), %st
+	fadds	4(%eax)
+	fstps	4(%eax)
 	addl	$8, %eax
-	jmp	.L44
+	jmp	.L46
 	.size	unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE
-	.ident	"GCC: (GNU) 4.4.0 20081204 (experimental)"
+	.ident	"GCC: (GNU) 4.4.0 20090304 (experimental)"
 	.section	.note.GNU-stack,"",@progbits
--- a/src/core/basic_ops_x86_sse2.s
+++ b/src/core/basic_ops_x86_sse2.s
@@ -70,19 +70,19 @@ alignedConvertToS16SSE2:
 	pushl	%ebx
 	subl	$8, %esp
 	movl	36(%esp), %eax
-	movss	.LC0, %xmm4
 	cmpb	$0, 44(%esp)
+	flds	.LC0
 	movl	28(%esp), %edx
 	movl	32(%esp), %ebx
 	movl	%eax, %esi
-	mulss	40(%esp), %xmm4
+	fmuls	40(%esp)
 	jne	.L13
 	testw	%ax, %ax
-	jle	.L15
+	jle	.L35
 	movl	%eax, %edi
 	shrw	$2, %di
 	cmpw	$3, %ax
-	movw	%ax, 2(%esp)
+	movw	%ax, 4(%esp)
 	leal	0(,%edi,4), %ebp
 	ja	.L33
 .L28:
@@ -98,17 +98,19 @@ alignedConvertToS16SSE2:
 	.p2align 4,,7
 	.p2align 3
 .L25:
-	movaps	%xmm4, %xmm0
-	mulss	(%edx), %xmm0
-	cvttss2si	%xmm0, %ecx
-	movaps	%xmm4, %xmm0
-	mulss	4(%edx), %xmm0
+	flds	(%edx)
+	fmul	%st(1), %st
+	fstps	4(%esp)
+	cvttss2si	4(%esp), %ecx
+	flds	4(%edx)
+	fmul	%st(1), %st
 	cmpl	$-32768, %ecx
 	cmovl	%edi, %ecx
 	cmpl	$32767, %ecx
 	cmovg	%ebx, %ecx
+	fstps	4(%esp)
 	movw	%cx, (%eax)
-	cvttss2si	%xmm0, %ecx
+	cvttss2si	4(%esp), %ecx
 	cmpl	$-32768, %ecx
 	cmovl	%edi, %ecx
 	cmpl	$32767, %ecx
@@ -119,6 +121,15 @@ alignedConvertToS16SSE2:
 	addl	$4, %eax
 	cmpw	%bp, %si
 	jg	.L25
+	fstp	%st(0)
+	jmp	.L15
+.L35:
+	fstp	%st(0)
+	jmp	.L15
+.L36:
+	fstp	%st(0)
+	.p2align 4,,7
+	.p2align 3
 .L15:
 	movswl	%si,%esi
 	addl	$8, %esp
@@ -132,11 +143,11 @@ alignedConvertToS16SSE2:
 	.p2align 3
 .L13:
 	testw	%ax, %ax
-	jle	.L15
+	jle	.L36
 	movl	%eax, %ebp
 	shrw	$2, %bp
 	cmpw	$3, %si
-	movw	%ax, 2(%esp)
+	movw	%ax, 4(%esp)
 	leal	0(,%ebp,4), %eax
 	ja	.L34
 .L27:
@@ -151,12 +162,13 @@ alignedConvertToS16SSE2:
 	.p2align 4,,7
 	.p2align 3
 .L20:
-	movaps	%xmm4, %xmm0
+	flds	(%ecx)
 	movl	$32767, %ebp
-	mulss	(%ecx), %xmm0
-	cvttss2si	%xmm0, %ebx
-	movaps	%xmm4, %xmm0
-	mulss	4(%ecx), %xmm0
+	fmul	%st(1), %st
+	fstps	4(%esp)
+	cvttss2si	4(%esp), %ebx
+	flds	4(%ecx)
+	fmul	%st(1), %st
 	cmpl	$-32768, %ebx
 	cmovl	%edi, %ebx
 	cmpl	$32767, %ebx
@@ -165,8 +177,9 @@ alignedConvertToS16SSE2:
 	sall	$8, %ebx
 	orl	%ebp, %ebx
 	movl	$32767, %ebp
+	fstps	4(%esp)
 	movw	%bx, (%edx)
-	cvttss2si	%xmm0, %ebx
+	cvttss2si	4(%esp), %ebx
 	cmpl	$-32768, %ebx
 	cmovl	%edi, %ebx
 	cmpl	$32767, %ebx
@@ -180,146 +193,149 @@ alignedConvertToS16SSE2:
 	addl	$4, %edx
 	cmpw	%ax, %si
 	jg	.L20
+	fstp	%st(0)
 	jmp	.L15
 	.p2align 4,,7
 	.p2align 3
 .L34:
 	testw	%ax, %ax
 	je	.L27
-	movaps	%xmm4, %xmm0
+	fsts	(%esp)
 	xorl	%ecx, %ecx
-	movdqa	.LC1, %xmm1
-	movss	%xmm4, 4(%esp)
-	shufps	$0, %xmm0, %xmm0
+	movdqa	.LC1, %xmm3
+	movss	(%esp), %xmm0
 	xorl	%edi, %edi
+	movdqa	.LC2, %xmm2
+	shufps	$0, %xmm0, %xmm0
 	movaps	%xmm0, %xmm7
-	movdqa	.LC2, %xmm0
 	.p2align 4,,7
 	.p2align 3
 .L19:
-	movaps	%xmm7, %xmm3
-	movdqa	%xmm0, %xmm5
-	movdqa	%xmm0, %xmm6
-	movaps	%xmm7, %xmm2
+	movaps	(%edx,%ecx,2), %xmm0
+	movdqa	%xmm2, %xmm4
+	movdqa	%xmm2, %xmm6
 	addl	$1, %edi
-	mulps	(%edx,%ecx,2), %xmm3
-	mulps	16(%edx,%ecx,2), %xmm2
-	cvttps2dq	%xmm3, %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpgtd	%xmm1, %xmm4
-	pand	%xmm4, %xmm3
-	pandn	%xmm1, %xmm4
-	por	%xmm4, %xmm3
-	cvttps2dq	%xmm2, %xmm2
-	movdqa	%xmm3, %xmm4
-	pcmpgtd	%xmm0, %xmm4
-	pand	%xmm4, %xmm5
-	pandn	%xmm3, %xmm4
-	movdqa	%xmm4, %xmm3
-	movdqa	%xmm2, %xmm4
-	por	%xmm5, %xmm3
-	pcmpgtd	%xmm1, %xmm4
+	movaps	16(%edx,%ecx,2), %xmm5
+	mulps	%xmm7, %xmm0
+	mulps	%xmm7, %xmm5
+	cvttps2dq	%xmm0, %xmm0
+	movdqa	%xmm0, %xmm1
+	pcmpgtd	%xmm3, %xmm1
+	pand	%xmm1, %xmm0
+	pandn	%xmm3, %xmm1
+	por	%xmm0, %xmm1
+	cvttps2dq	%xmm5, %xmm5
+	movdqa	%xmm1, %xmm0
+	pcmpgtd	%xmm2, %xmm0
+	pand	%xmm0, %xmm4
+	pandn	%xmm1, %xmm0
+	movdqa	%xmm0, %xmm1
+	movdqa	%xmm5, %xmm0
+	por	%xmm4, %xmm1
+	pcmpgtd	%xmm3, %xmm0
+	movdqa	.LC3, %xmm4
+	pand	%xmm0, %xmm5
+	pand	%xmm1, %xmm4
+	pandn	%xmm3, %xmm0
+	psrad	$8, %xmm4
+	por	%xmm5, %xmm0
+	pslld	$8, %xmm1
+	movdqa	%xmm0, %xmm5
+	pcmpgtd	%xmm2, %xmm5
+	pand	%xmm5, %xmm6
+	pandn	%xmm0, %xmm5
+	movdqa	%xmm5, %xmm0
 	movdqa	.LC3, %xmm5
-	pand	%xmm4, %xmm2
-	pand	%xmm3, %xmm5
-	pandn	%xmm1, %xmm4
+	por	%xmm6, %xmm0
+	pand	%xmm0, %xmm5
+	pslld	$8, %xmm0
 	psrad	$8, %xmm5
-	por	%xmm4, %xmm2
-	pslld	$8, %xmm3
-	movdqa	%xmm2, %xmm4
-	pcmpgtd	%xmm0, %xmm4
-	pand	%xmm4, %xmm6
-	pandn	%xmm2, %xmm4
-	movdqa	%xmm4, %xmm2
-	por	%xmm6, %xmm2
-	movdqa	.LC3, %xmm6
-	pand	%xmm2, %xmm6
-	pslld	$8, %xmm2
-	psrad	$8, %xmm6
-	movdqa	%xmm5, %xmm4
-	punpcklwd	%xmm6, %xmm5
-	punpckhwd	%xmm6, %xmm4
-	movdqa	%xmm5, %xmm6
-	punpcklwd	%xmm4, %xmm5
-	punpckhwd	%xmm4, %xmm6
-	movdqa	%xmm3, %xmm4
-	punpcklwd	%xmm6, %xmm5
-	punpckhwd	%xmm2, %xmm4
-	punpcklwd	%xmm2, %xmm3
-	movdqa	%xmm3, %xmm6
-	punpcklwd	%xmm4, %xmm3
-	punpckhwd	%xmm4, %xmm6
-	punpcklwd	%xmm6, %xmm3
-	por	%xmm3, %xmm5
-	movdqa	%xmm5, (%ebx,%ecx)
+	movdqa	%xmm4, %xmm6
+	punpcklwd	%xmm5, %xmm4
+	punpckhwd	%xmm5, %xmm6
+	movdqa	%xmm4, %xmm5
+	punpcklwd	%xmm6, %xmm4
+	punpckhwd	%xmm6, %xmm5
+	punpcklwd	%xmm5, %xmm4
+	movdqa	%xmm1, %xmm5
+	punpcklwd	%xmm0, %xmm1
+	punpckhwd	%xmm0, %xmm5
+	movdqa	%xmm1, %xmm0
+	punpcklwd	%xmm5, %xmm1
+	punpckhwd	%xmm5, %xmm0
+	punpcklwd	%xmm0, %xmm1
+	por	%xmm1, %xmm4
+	movdqa	%xmm4, (%ebx,%ecx)
 	addl	$16, %ecx
 	cmpw	%di, %bp
 	ja	.L19
-	cmpw	2(%esp), %ax
-	movss	4(%esp), %xmm4
+	cmpw	4(%esp), %ax
 	jne	.L18
+	fstp	%st(0)
 	jmp	.L15
 	.p2align 4,,7
 	.p2align 3
 .L33:
 	testw	%bp, %bp
-	.p2align 4,,3
+	.p2align 4,,4
 	.p2align 3
 	je	.L28
-	movaps	%xmm4, %xmm0
+	fsts	(%esp)
 	xorl	%eax, %eax
-	movdqa	.LC1, %xmm1
-	shufps	$0, %xmm0, %xmm0
+	movdqa	.LC1, %xmm3
+	movss	(%esp), %xmm0
 	xorl	%ecx, %ecx
-	movaps	%xmm0, %xmm6
-	movdqa	.LC2, %xmm0
+	movdqa	.LC2, %xmm2
+	shufps	$0, %xmm0, %xmm0
+	movaps	%xmm0, %xmm5
 	.p2align 4,,7
 	.p2align 3
 .L24:
-	movaps	%xmm6, %xmm3
+	movaps	(%edx,%eax,2), %xmm0
 	addl	$1, %ecx
-	movdqa	%xmm0, %xmm7
-	movaps	%xmm6, %xmm2
-	mulps	(%edx,%eax,2), %xmm3
-	mulps	16(%edx,%eax,2), %xmm2
-	cvttps2dq	%xmm3, %xmm3
-	movdqa	%xmm3, %xmm5
-	pcmpgtd	%xmm1, %xmm5
-	pand	%xmm5, %xmm3
-	pandn	%xmm1, %xmm5
-	por	%xmm5, %xmm3
-	cvttps2dq	%xmm2, %xmm2
-	movdqa	%xmm3, %xmm5
-	pcmpgtd	%xmm0, %xmm5
-	pand	%xmm5, %xmm7
-	pandn	%xmm3, %xmm5
-	movdqa	%xmm5, %xmm3
-	movdqa	%xmm2, %xmm5
-	por	%xmm7, %xmm3
-	pcmpgtd	%xmm1, %xmm5
-	movdqa	%xmm0, %xmm7
-	pand	%xmm5, %xmm2
-	pandn	%xmm1, %xmm5
-	por	%xmm5, %xmm2
-	movdqa	%xmm2, %xmm5
-	pcmpgtd	%xmm0, %xmm5
-	pand	%xmm5, %xmm7
-	pandn	%xmm2, %xmm5
-	movdqa	%xmm5, %xmm2
-	movdqa	%xmm3, %xmm5
-	por	%xmm7, %xmm2
-	punpckhwd	%xmm2, %xmm5
-	punpcklwd	%xmm2, %xmm3
-	movdqa	%xmm3, %xmm7
-	punpcklwd	%xmm5, %xmm3
-	punpckhwd	%xmm5, %xmm7
-	punpcklwd	%xmm7, %xmm3
-	movdqa	%xmm3, (%ebx,%eax)
+	movdqa	%xmm2, %xmm6
+	movaps	16(%edx,%eax,2), %xmm4
+	mulps	%xmm5, %xmm0
+	mulps	%xmm5, %xmm4
+	cvttps2dq	%xmm0, %xmm0
+	movdqa	%xmm0, %xmm1
+	pcmpgtd	%xmm3, %xmm1
+	pand	%xmm1, %xmm0
+	pandn	%xmm3, %xmm1
+	por	%xmm0, %xmm1
+	cvttps2dq	%xmm4, %xmm4
+	movdqa	%xmm1, %xmm0
+	pcmpgtd	%xmm2, %xmm0
+	pand	%xmm0, %xmm6
+	pandn	%xmm1, %xmm0
+	movdqa	%xmm0, %xmm1
+	movdqa	%xmm4, %xmm0
+	por	%xmm6, %xmm1
+	pcmpgtd	%xmm3, %xmm0
+	movdqa	%xmm2, %xmm6
+	pand	%xmm0, %xmm4
+	pandn	%xmm3, %xmm0
+	por	%xmm4, %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpgtd	%xmm2, %xmm4
+	pand	%xmm4, %xmm6
+	pandn	%xmm0, %xmm4
+	movdqa	%xmm4, %xmm0
+	movdqa	%xmm1, %xmm4
+	por	%xmm6, %xmm0
+	punpckhwd	%xmm0, %xmm4
+	punpcklwd	%xmm0, %xmm1
+	movdqa	%xmm1, %xmm0
+	punpcklwd	%xmm4, %xmm1
+	punpckhwd	%xmm4, %xmm0
+	punpcklwd	%xmm0, %xmm1
+	movdqa	%xmm1, (%ebx,%eax)
 	addl	$16, %eax
 	cmpw	%cx, %di
 	ja	.L24
-	cmpw	%bp, 2(%esp)
+	cmpw	%bp, 4(%esp)
 	jne	.L23
+	fstp	%st(0)
 	jmp	.L15
 	.size	alignedConvertToS16SSE2, .-alignedConvertToS16SSE2
 	.section	.rodata.cst4,"aM",@progbits,4
@@ -345,5 +361,5 @@ alignedConvertToS16SSE2:
 	.long	65280
 	.long	65280
 	.long	65280
-	.ident	"GCC: (GNU) 4.4.0 20081204 (experimental)"
+	.ident	"GCC: (GNU) 4.4.0 20090304 (experimental)"
 	.section	.note.GNU-stack,"",@progbits