Regenerated optimized x86_64 basic ops with latest GCC 4.4.0 snapshot

2009-03-04 16:30:33 +01:00
parent 18bbed4fd6
commit 2887895df7
2 changed files with 241 additions and 247 deletions
--- a/src/core/basic_ops_x86_64_sse.s
+++ b/src/core/basic_ops_x86_64_sse.s
@@ -36,14 +36,14 @@ alignedMemCpySSE:
 	.type	alignedMemClearSSE, @function
 alignedMemClearSSE:
 .LFB510:
-	movslq	%esi,%rax
-	shrq	$6, %rax
-	testl	%eax, %eax
+	movslq	%esi,%rsi
+	shrq	$6, %rsi
+	testl	%esi, %esi
 	jle	.L10
-	subl	$1, %eax
+	subl	$1, %esi
 	xorps	%xmm0, %xmm0
-	salq	$6, %rax
-	leaq	64(%rax,%rdi), %rax
+	salq	$6, %rsi
+	leaq	64(%rdi,%rsi), %rax
 	.align 16
 .L9:
 	movaps	%xmm0, (%rdi)
@@ -65,23 +65,23 @@ alignedBufApplyGainSSE:
 .LFB511:
 	testl	%esi, %esi
 	jle	.L15
-	subl	$1, %esi
+	leal	-1(%rsi), %edx
 	shufps	$0, %xmm0, %xmm0
-	shrl	$3, %esi
 	xorl	%eax, %eax
-	leal	1(%rsi), %edx
+	shrl	$3, %edx
+	addl	$1, %edx
 	.align 16
 .L14:
-	movaps	%xmm0, %xmm3
+	movaps	16(%rdi), %xmm3
 	addl	$1, %eax
-	movaps	%xmm0, %xmm2
-	movaps	%xmm0, %xmm1
-	movaps	%xmm0, %xmm4
-	mulps	16(%rdi), %xmm3
-	mulps	32(%rdi), %xmm2
-	mulps	48(%rdi), %xmm1
-	mulps	(%rdi), %xmm4
+	movaps	32(%rdi), %xmm2
+	mulps	%xmm0, %xmm3
+	movaps	48(%rdi), %xmm1
+	mulps	%xmm0, %xmm2
+	movaps	(%rdi), %xmm4
+	mulps	%xmm0, %xmm1
 	movaps	%xmm3, 16(%rdi)
+	mulps	%xmm0, %xmm4
 	movaps	%xmm2, 32(%rdi)
 	movaps	%xmm1, 48(%rdi)
 	movaps	%xmm4, (%rdi)
@@ -100,11 +100,11 @@ alignedBufMixSSE:
 .LFB512:
 	testl	%edx, %edx
 	jle	.L20
-	subl	$1, %edx
+	leal	-1(%rdx), %ecx
 	xorl	%eax, %eax
-	shrl	$3, %edx
-	leal	1(%rdx), %ecx
 	xorl	%edx, %edx
+	shrl	$3, %ecx
+	addl	$1, %ecx
 	.align 16
 .L19:
 	movaps	16(%rdi,%rax), %xmm2
@@ -136,23 +136,23 @@ alignedBufMixLRCoeffSSE:
 	testl	%edx, %edx
 	jle	.L25
 	unpcklps	%xmm1, %xmm0
-	subl	$1, %edx
-	shrl	$2, %edx
+	leal	-1(%rdx), %ecx
 	xorl	%eax, %eax
-	leal	1(%rdx), %ecx
 	xorl	%edx, %edx
+	shrl	$2, %ecx
 	movlhps	%xmm0, %xmm0
+	addl	$1, %ecx
 	.align 16
 .L24:
-	movaps	%xmm0, %xmm1
+	movaps	16(%rsi,%rax), %xmm2
 	addl	$1, %edx
-	movaps	%xmm0, %xmm2
-	mulps	16(%rsi,%rax), %xmm1
-	mulps	(%rsi,%rax), %xmm2
-	addps	16(%rdi,%rax), %xmm1
-	addps	(%rdi,%rax), %xmm2
-	movaps	%xmm1, 16(%rdi,%rax)
-	movaps	%xmm2, (%rdi,%rax)
+	movaps	(%rsi,%rax), %xmm3
+	mulps	%xmm0, %xmm2
+	mulps	%xmm0, %xmm3
+	addps	16(%rdi,%rax), %xmm2
+	addps	(%rdi,%rax), %xmm3
+	movaps	%xmm2, 16(%rdi,%rax)
+	movaps	%xmm3, (%rdi,%rax)
 	addq	$32, %rax
 	cmpl	%edx, %ecx
 	ja	.L24
@@ -168,25 +168,25 @@ alignedBufWetDryMixSSE:
 .LFB515:
 	testl	%edx, %edx
 	jle	.L30
-	subl	$1, %edx
+	leal	-1(%rdx), %ecx
 	shufps	$0, %xmm1, %xmm1
 	shufps	$0, %xmm0, %xmm0
-	shrl	$2, %edx
-	leal	1(%rdx), %ecx
 	xorl	%eax, %eax
+	shrl	$2, %ecx
 	xorl	%edx, %edx
+	addl	$1, %ecx
 	.align 16
 .L29:
-	movaps	%xmm1, %xmm3
+	movaps	16(%rsi,%rax), %xmm3
 	addl	$1, %edx
-	movaps	%xmm0, %xmm2
-	mulps	16(%rdi,%rax), %xmm3
-	movaps	%xmm1, %xmm4
-	mulps	16(%rsi,%rax), %xmm2
-	mulps	(%rdi,%rax), %xmm4
+	movaps	16(%rdi,%rax), %xmm2
+	mulps	%xmm0, %xmm3
+	movaps	(%rsi,%rax), %xmm4
+	mulps	%xmm1, %xmm2
+	mulps	%xmm0, %xmm4
 	addps	%xmm3, %xmm2
-	movaps	%xmm0, %xmm3
-	mulps	(%rsi,%rax), %xmm3
+	movaps	(%rdi,%rax), %xmm3
+	mulps	%xmm1, %xmm3
 	movaps	%xmm2, 16(%rdi,%rax)
 	addps	%xmm4, %xmm3
 	movaps	%xmm3, (%rdi,%rax)
@@ -226,84 +226,80 @@ alignedBufWetDryMixSplittedSSE:
 .L34:
 	movaps	%xmm1, %xmm2
 	movq	%rdi, %rax
-	xorps	%xmm6, %xmm6
+	xorps	%xmm10, %xmm10
 	movq	%rsi, %r9
 	shufps	$0, %xmm2, %xmm2
 	movq	%rdx, %r8
 	xorl	%r10d, %r10d
-	movaps	%xmm2, %xmm8
+	movaps	%xmm2, %xmm12
 	movaps	%xmm0, %xmm2
 	shufps	$0, %xmm2, %xmm2
-	movaps	%xmm2, %xmm7
+	movaps	%xmm2, %xmm11
 	.align 16
 .L37:
-	movaps	(%rax), %xmm12
+	movaps	(%rax), %xmm2
 	addl	$1, %r10d
-	movaps	%xmm6, %xmm3
+	movaps	%xmm10, %xmm9
 	movaps	16(%rax), %xmm5
-	movaps	%xmm12, %xmm14
-	movlps	(%r8), %xmm3
-	movaps	32(%rax), %xmm9
-	shufps	$136, %xmm5, %xmm14
-	shufps	$221, %xmm5, %xmm12
-	movhps	8(%r8), %xmm3
-	movaps	48(%rax), %xmm4
-	movaps	%xmm9, %xmm13
-	movaps	%xmm6, %xmm5
-	shufps	$221, %xmm4, %xmm9
-	movlps	(%r9), %xmm5
-	shufps	$136, %xmm4, %xmm13
-	movaps	%xmm6, %xmm4
-	movhps	8(%r9), %xmm5
-	movaps	%xmm14, %xmm11
-	movlps	16(%r9), %xmm4
-	movaps	%xmm12, %xmm15
-	movaps	%xmm5, %xmm2
-	movhps	24(%r9), %xmm4
-	shufps	$136, %xmm13, %xmm11
-	movaps	%xmm3, %xmm10
+	movaps	%xmm2, %xmm4
+	movlps	(%r9), %xmm9
+	movaps	%xmm10, %xmm8
+	movaps	32(%rax), %xmm14
+	shufps	$136, %xmm5, %xmm4
+	movhps	8(%r9), %xmm9
+	movaps	48(%rax), %xmm3
+	movaps	%xmm14, %xmm15
+	movlps	16(%r9), %xmm8
+	shufps	$221, %xmm5, %xmm2
+	shufps	$136, %xmm3, %xmm15
+	movhps	24(%r9), %xmm8
+	shufps	$221, %xmm3, %xmm14
+	movaps	%xmm4, %xmm5
 	addq	$32, %r9
-	shufps	$136, %xmm4, %xmm2
-	mulps	%xmm8, %xmm11
-	mulps	%xmm7, %xmm2
-	shufps	$221, %xmm13, %xmm14
-	shufps	$136, %xmm9, %xmm15
-	shufps	$221, %xmm4, %xmm5
-	addps	%xmm2, %xmm11
-	movaps	%xmm6, %xmm2
-	shufps	$221, %xmm9, %xmm12
-	movlps	16(%r8), %xmm2
-	mulps	%xmm8, %xmm14
-	movhps	24(%r8), %xmm2
-	mulps	%xmm7, %xmm5
-	movaps	%xmm11, %xmm9
+	movaps	%xmm9, %xmm3
+	shufps	$136, %xmm15, %xmm5
+	movaps	%xmm10, %xmm7
+	shufps	$136, %xmm8, %xmm3
+	movlps	(%r8), %xmm7
+	movaps	%xmm10, %xmm6
+	mulps	%xmm12, %xmm5
+	movhps	8(%r8), %xmm7
+	mulps	%xmm11, %xmm3
+	movlps	16(%r8), %xmm6
+	movaps	%xmm7, %xmm13
+	movhps	24(%r8), %xmm6
+	shufps	$221, %xmm15, %xmm4
+	shufps	$221, %xmm8, %xmm9
 	addq	$32, %r8
-	shufps	$136, %xmm2, %xmm10
-	shufps	$221, %xmm2, %xmm3
-	movaps	%xmm14, %xmm4
-	mulps	%xmm8, %xmm15
-	addps	%xmm5, %xmm4
-	mulps	%xmm7, %xmm10
-	movaps	%xmm11, %xmm5
-	mulps	%xmm8, %xmm12
-	mulps	%xmm7, %xmm3
-	addps	%xmm15, %xmm10
-	unpcklps	%xmm4, %xmm9
-	movaps	%xmm12, %xmm2
+	shufps	$136, %xmm6, %xmm13
+	addps	%xmm3, %xmm5
+	movaps	%xmm2, %xmm3
+	shufps	$221, %xmm6, %xmm7
+	shufps	$136, %xmm14, %xmm3
+	shufps	$221, %xmm14, %xmm2
+	mulps	%xmm11, %xmm13
+	movaps	%xmm5, %xmm6
+	mulps	%xmm12, %xmm3
+	mulps	%xmm12, %xmm4
+	mulps	%xmm11, %xmm9
+	addps	%xmm13, %xmm3
+	mulps	%xmm12, %xmm2
+	mulps	%xmm11, %xmm7
+	addps	%xmm9, %xmm4
+	addps	%xmm7, %xmm2
+	unpcklps	%xmm4, %xmm6
 	unpckhps	%xmm4, %xmm5
-	addps	%xmm3, %xmm2
-	movaps	%xmm10, %xmm4
-	movaps	%xmm10, %xmm3
+	movaps	%xmm3, %xmm4
 	unpcklps	%xmm2, %xmm4
 	unpckhps	%xmm2, %xmm3
-	movaps	%xmm9, %xmm2
+	movaps	%xmm6, %xmm2
 	unpcklps	%xmm4, %xmm2
-	unpckhps	%xmm4, %xmm9
+	unpckhps	%xmm4, %xmm6
 	movaps	%xmm2, (%rax)
 	movaps	%xmm5, %xmm2
 	unpckhps	%xmm3, %xmm5
 	unpcklps	%xmm3, %xmm2
-	movaps	%xmm9, 16(%rax)
+	movaps	%xmm6, 16(%rax)
 	movaps	%xmm2, 32(%rax)
 	movaps	%xmm5, 48(%rax)
 	addq	$64, %rax
@@ -325,34 +321,34 @@ alignedBufWetDryMixSplittedSSE:
 	addq	%rbx, %rdx
 	.align 16
 .L38:
-	movaps	%xmm1, %xmm3
+	movss	(%r11), %xmm3
 	addl	$2, %r9d
-	movaps	%xmm0, %xmm2
-	mulss	(%r8), %xmm3
-	mulss	(%r11), %xmm2
+	movss	(%r8), %xmm2
+	mulss	%xmm0, %xmm3
+	mulss	%xmm1, %xmm2
 	addq	$8, %r11
 	addss	%xmm3, %xmm2
-	movaps	%xmm1, %xmm3
-	mulss	4(%r8), %xmm3
 	movss	%xmm2, (%r8)
-	movaps	%xmm0, %xmm2
-	mulss	(%r10), %xmm2
+	movss	4(%r8), %xmm2
+	movss	(%r10), %xmm3
+	mulss	%xmm1, %xmm2
 	addq	$8, %r10
+	mulss	%xmm0, %xmm3
 	addss	%xmm3, %xmm2
-	movaps	%xmm1, %xmm3
 	movss	%xmm2, 4(%r8)
-	movaps	%xmm0, %xmm2
 	addq	$16, %r8
-	mulss	(%rax), %xmm3
-	mulss	(%rsi), %xmm2
+	movss	(%rsi), %xmm3
 	addq	$8, %rsi
+	movss	(%rax), %xmm2
+	mulss	%xmm0, %xmm3
+	mulss	%xmm1, %xmm2
 	addss	%xmm3, %xmm2
-	movaps	%xmm1, %xmm3
-	mulss	4(%rax), %xmm3
 	movss	%xmm2, (%rax)
-	movaps	%xmm0, %xmm2
-	mulss	(%rdx), %xmm2
+	movss	4(%rax), %xmm2
+	movss	(%rdx), %xmm3
+	mulss	%xmm1, %xmm2
 	addq	$8, %rdx
+	mulss	%xmm0, %xmm3
 	addss	%xmm3, %xmm2
 	movss	%xmm2, 4(%rax)
 	addq	$16, %rax
@@ -369,11 +365,11 @@ alignedBufWetDryMixSplittedSSE:
 	.type	unalignedBufMixLRCoeffSSE, @function
 unalignedBufMixLRCoeffSSE:
 .LFB514:
-	movl	%edx, %eax
-	shrl	$31, %eax
-	leal	(%rdx,%rax), %ecx
-	andl	$1, %ecx
-	cmpl	%eax, %ecx
+	movl	%edx, %ecx
+	shrl	$31, %ecx
+	leal	(%rdx,%rcx), %eax
+	andl	$1, %eax
+	cmpl	%ecx, %eax
 	jne	.L52
 .L44:
 	testl	%edx, %edx
@@ -413,20 +409,20 @@ unalignedBufMixLRCoeffSSE:
 	salq	$4, %rdx
 	.align 16
 .L48:
-	movaps	%xmm0, %xmm2
-	mulss	(%rsi,%rax), %xmm2
+	movss	(%rsi,%rax), %xmm2
+	mulss	%xmm0, %xmm2
 	addss	(%rdi,%rax), %xmm2
 	movss	%xmm2, (%rdi,%rax)
-	movaps	%xmm1, %xmm2
-	mulss	4(%rsi,%rax), %xmm2
+	movss	4(%rsi,%rax), %xmm2
+	mulss	%xmm1, %xmm2
 	addss	4(%rdi,%rax), %xmm2
 	movss	%xmm2, 4(%rdi,%rax)
-	movaps	%xmm0, %xmm2
-	mulss	8(%rsi,%rax), %xmm2
+	movss	8(%rsi,%rax), %xmm2
+	mulss	%xmm0, %xmm2
 	addss	8(%rdi,%rax), %xmm2
 	movss	%xmm2, 8(%rdi,%rax)
-	movaps	%xmm1, %xmm2
-	mulss	12(%rsi,%rax), %xmm2
+	movss	12(%rsi,%rax), %xmm2
+	mulss	%xmm1, %xmm2
 	addss	12(%rdi,%rax), %xmm2
 	movss	%xmm2, 12(%rdi,%rax)
 	addq	$16, %rax
@@ -436,17 +432,15 @@ unalignedBufMixLRCoeffSSE:
 	rep
 	ret
 .L52:
-	movaps	%xmm0, %xmm2
+	movss	(%rsi), %xmm2
 	subl	$1, %edx
-	movss	(%rdi), %xmm3
-	mulss	(%rsi), %xmm2
-	addss	%xmm3, %xmm2
-	movss	4(%rdi), %xmm3
+	mulss	%xmm0, %xmm2
+	addss	(%rdi), %xmm2
 	movss	%xmm2, (%rdi)
-	movaps	%xmm1, %xmm2
-	mulss	4(%rsi), %xmm2
+	movss	4(%rsi), %xmm2
 	addq	$8, %rsi
-	addss	%xmm3, %xmm2
+	mulss	%xmm1, %xmm2
+	addss	4(%rdi), %xmm2
 	movss	%xmm2, 4(%rdi)
 	addq	$8, %rdi
 	jmp	.L44
@@ -558,4 +552,4 @@ unalignedBufMixLRCoeffSSE:
 	.byte	0x0
 	.align 8
 .LEFDE15:
-	.ident	"GCC: (GNU) 4.4.0 20081204 (experimental)"
+	.ident	"GCC: (GNU) 4.4.0 20090304 (experimental)"
--- a/src/core/basic_ops_x86_64_sse2.s
+++ b/src/core/basic_ops_x86_64_sse2.s
@@ -36,14 +36,14 @@ alignedMemCpySSE2:
 	.type	alignedMemClearSSE2, @function
 alignedMemClearSSE2:
 .LFB510:
-	movslq	%esi,%rax
-	shrq	$6, %rax
-	testl	%eax, %eax
+	movslq	%esi,%rsi
+	shrq	$6, %rsi
+	testl	%esi, %esi
 	jle	.L10
-	subl	$1, %eax
+	subl	$1, %esi
 	pxor	%xmm0, %xmm0
-	salq	$6, %rax
-	leaq	64(%rax,%rdi), %rax
+	salq	$6, %rsi
+	leaq	64(%rdi,%rsi), %rax
 	.align 16
 .L9:
 	movdqa	%xmm0, (%rdi)
@@ -89,11 +89,11 @@ alignedConvertToS16SSE2:
 	movl	$-32768, %edi
 	.align 16
 .L25:
-	movaps	%xmm0, %xmm1
-	mulss	(%rcx), %xmm1
+	movss	(%rcx), %xmm1
+	mulss	%xmm0, %xmm1
 	cvttss2si	%xmm1, %esi
-	movaps	%xmm0, %xmm1
-	mulss	4(%rcx), %xmm1
+	movss	4(%rcx), %xmm1
+	mulss	%xmm0, %xmm1
 	cmpl	$-32768, %esi
 	cmovl	%edi, %esi
 	cmpl	$32767, %esi
@@ -136,11 +136,11 @@ alignedConvertToS16SSE2:
 	movl	$32767, %esi
 	.align 16
 .L20:
-	movaps	%xmm0, %xmm1
-	mulss	(%rcx), %xmm1
+	movss	(%rcx), %xmm1
+	mulss	%xmm0, %xmm1
 	cvttss2si	%xmm1, %ebx
-	movaps	%xmm0, %xmm1
-	mulss	4(%rcx), %xmm1
+	movss	4(%rcx), %xmm1
+	mulss	%xmm0, %xmm1
 	cmpl	$-32768, %ebx
 	cmovl	%edi, %ebx
 	cmpl	$32767, %ebx
@@ -176,71 +176,71 @@ alignedConvertToS16SSE2:
 	je	.L27
 	movaps	%xmm0, %xmm1
 	movq	%rdi, %rcx
-	movdqa	.LC1(%rip), %xmm2
+	movdqa	.LC1(%rip), %xmm4
 	movq	%rsi, %r10
 	shufps	$0, %xmm1, %xmm1
 	xorl	%r9d, %r9d
-	movdqa	.LC3(%rip), %xmm8
+	movdqa	.LC2(%rip), %xmm3
 	movaps	%xmm1, %xmm9
-	movdqa	.LC2(%rip), %xmm1
+	movdqa	.LC3(%rip), %xmm8
 	.align 16
 .L19:
-	movaps	%xmm9, %xmm4
+	movaps	(%rcx), %xmm1
 	addl	$1, %r9d
-	movaps	%xmm9, %xmm3
-	mulps	(%rcx), %xmm4
-	movdqa	%xmm1, %xmm6
-	mulps	16(%rcx), %xmm3
+	movdqa	%xmm3, %xmm5
+	mulps	%xmm9, %xmm1
+	movaps	16(%rcx), %xmm6
+	movdqa	%xmm3, %xmm7
 	addq	$32, %rcx
-	cvttps2dq	%xmm4, %xmm4
-	movdqa	%xmm4, %xmm5
-	pcmpgtd	%xmm2, %xmm5
-	cvttps2dq	%xmm3, %xmm3
-	pand	%xmm5, %xmm4
-	pandn	%xmm2, %xmm5
-	por	%xmm5, %xmm4
-	movdqa	%xmm4, %xmm5
-	pcmpgtd	%xmm1, %xmm5
-	pand	%xmm5, %xmm6
-	pandn	%xmm4, %xmm5
-	movdqa	%xmm5, %xmm4
-	movdqa	%xmm3, %xmm5
-	por	%xmm6, %xmm4
-	movdqa	%xmm1, %xmm6
-	pcmpgtd	%xmm2, %xmm5
-	pand	%xmm5, %xmm3
-	pandn	%xmm2, %xmm5
-	movdqa	%xmm4, %xmm7
-	pslld	$8, %xmm4
-	pand	%xmm8, %xmm7
-	por	%xmm5, %xmm3
-	psrad	$8, %xmm7
-	movdqa	%xmm3, %xmm5
-	pcmpgtd	%xmm1, %xmm5
-	pand	%xmm5, %xmm6
-	pandn	%xmm3, %xmm5
-	movdqa	%xmm5, %xmm3
-	por	%xmm6, %xmm3
-	movdqa	%xmm7, %xmm6
-	movdqa	%xmm3, %xmm5
-	pslld	$8, %xmm3
+	mulps	%xmm9, %xmm6
+	cvttps2dq	%xmm1, %xmm1
+	movdqa	%xmm1, %xmm2
+	pcmpgtd	%xmm4, %xmm2
+	cvttps2dq	%xmm6, %xmm6
+	pand	%xmm2, %xmm1
+	pandn	%xmm4, %xmm2
+	por	%xmm1, %xmm2
+	movdqa	%xmm2, %xmm1
+	pcmpgtd	%xmm3, %xmm1
+	pand	%xmm1, %xmm5
+	pandn	%xmm2, %xmm1
+	movdqa	%xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	por	%xmm5, %xmm2
+	pcmpgtd	%xmm4, %xmm1
+	pand	%xmm1, %xmm6
+	pandn	%xmm4, %xmm1
+	movdqa	%xmm2, %xmm5
+	pslld	$8, %xmm2
 	pand	%xmm8, %xmm5
+	por	%xmm6, %xmm1
 	psrad	$8, %xmm5
-	punpcklwd	%xmm5, %xmm7
-	punpckhwd	%xmm5, %xmm6
-	movdqa	%xmm4, %xmm5
-	punpcklwd	%xmm3, %xmm4
-	movdqa	%xmm7, %xmm10
-	punpckhwd	%xmm3, %xmm5
-	punpcklwd	%xmm6, %xmm7
-	punpckhwd	%xmm6, %xmm10
-	punpcklwd	%xmm10, %xmm7
-	movdqa	%xmm4, %xmm10
-	punpcklwd	%xmm5, %xmm4
-	punpckhwd	%xmm5, %xmm10
-	punpcklwd	%xmm10, %xmm4
-	por	%xmm7, %xmm4
-	movdqa	%xmm4, (%r10)
+	movdqa	%xmm1, %xmm6
+	pcmpgtd	%xmm3, %xmm6
+	pand	%xmm6, %xmm7
+	pandn	%xmm1, %xmm6
+	movdqa	%xmm6, %xmm1
+	por	%xmm7, %xmm1
+	movdqa	%xmm5, %xmm7
+	movdqa	%xmm1, %xmm6
+	pslld	$8, %xmm1
+	pand	%xmm8, %xmm6
+	psrad	$8, %xmm6
+	punpcklwd	%xmm6, %xmm5
+	punpckhwd	%xmm6, %xmm7
+	movdqa	%xmm5, %xmm6
+	punpcklwd	%xmm7, %xmm5
+	punpckhwd	%xmm7, %xmm6
+	punpcklwd	%xmm6, %xmm5
+	movdqa	%xmm2, %xmm6
+	punpcklwd	%xmm1, %xmm2
+	punpckhwd	%xmm1, %xmm6
+	movdqa	%xmm2, %xmm1
+	punpcklwd	%xmm6, %xmm2
+	punpckhwd	%xmm6, %xmm1
+	punpcklwd	%xmm1, %xmm2
+	por	%xmm2, %xmm5
+	movdqa	%xmm5, (%r10)
 	addq	$16, %r10
 	cmpw	%r9w, %bx
 	ja	.L19
@@ -253,54 +253,54 @@ alignedConvertToS16SSE2:
 	je	.L28
 	movaps	%xmm0, %xmm1
 	movq	%rdi, %rcx
-	movdqa	.LC1(%rip), %xmm2
+	movdqa	.LC1(%rip), %xmm4
 	movq	%rsi, %r10
 	shufps	$0, %xmm1, %xmm1
 	xorl	%r9d, %r9d
+	movdqa	.LC2(%rip), %xmm3
 	movaps	%xmm1, %xmm6
-	movdqa	.LC2(%rip), %xmm1
 	.align 16
 .L24:
-	movaps	%xmm6, %xmm4
+	movaps	(%rcx), %xmm1
 	addl	$1, %r9d
-	movaps	%xmm6, %xmm3
-	mulps	(%rcx), %xmm4
-	movdqa	%xmm1, %xmm7
-	mulps	16(%rcx), %xmm3
+	movdqa	%xmm3, %xmm7
+	mulps	%xmm6, %xmm1
+	movaps	16(%rcx), %xmm5
 	addq	$32, %rcx
-	cvttps2dq	%xmm4, %xmm4
-	movdqa	%xmm4, %xmm5
-	pcmpgtd	%xmm2, %xmm5
-	cvttps2dq	%xmm3, %xmm3
-	pand	%xmm5, %xmm4
-	pandn	%xmm2, %xmm5
-	por	%xmm5, %xmm4
-	movdqa	%xmm4, %xmm5
-	pcmpgtd	%xmm1, %xmm5
+	mulps	%xmm6, %xmm5
+	cvttps2dq	%xmm1, %xmm1
+	movdqa	%xmm1, %xmm2
+	pcmpgtd	%xmm4, %xmm2
+	cvttps2dq	%xmm5, %xmm5
+	pand	%xmm2, %xmm1
+	pandn	%xmm4, %xmm2
+	por	%xmm1, %xmm2
+	movdqa	%xmm2, %xmm1
+	pcmpgtd	%xmm3, %xmm1
+	pand	%xmm1, %xmm7
+	pandn	%xmm2, %xmm1
+	movdqa	%xmm1, %xmm2
+	movdqa	%xmm5, %xmm1
+	por	%xmm7, %xmm2
+	movdqa	%xmm3, %xmm7
+	pcmpgtd	%xmm4, %xmm1
+	pand	%xmm1, %xmm5
+	pandn	%xmm4, %xmm1
+	por	%xmm5, %xmm1
+	movdqa	%xmm1, %xmm5
+	pcmpgtd	%xmm3, %xmm5
 	pand	%xmm5, %xmm7
-	pandn	%xmm4, %xmm5
-	movdqa	%xmm5, %xmm4
-	movdqa	%xmm3, %xmm5
-	por	%xmm7, %xmm4
-	movdqa	%xmm1, %xmm7
-	pcmpgtd	%xmm2, %xmm5
-	pand	%xmm5, %xmm3
-	pandn	%xmm2, %xmm5
-	por	%xmm5, %xmm3
-	movdqa	%xmm3, %xmm5
-	pcmpgtd	%xmm1, %xmm5
-	pand	%xmm5, %xmm7
-	pandn	%xmm3, %xmm5
-	movdqa	%xmm5, %xmm3
-	movdqa	%xmm4, %xmm5
-	por	%xmm7, %xmm3
-	punpcklwd	%xmm3, %xmm4
-	punpckhwd	%xmm3, %xmm5
-	movdqa	%xmm4, %xmm7
-	punpcklwd	%xmm5, %xmm4
-	punpckhwd	%xmm5, %xmm7
-	punpcklwd	%xmm7, %xmm4
-	movdqa	%xmm4, (%r10)
+	pandn	%xmm1, %xmm5
+	movdqa	%xmm5, %xmm1
+	movdqa	%xmm2, %xmm5
+	por	%xmm7, %xmm1
+	punpcklwd	%xmm1, %xmm2
+	punpckhwd	%xmm1, %xmm5
+	movdqa	%xmm2, %xmm1
+	punpcklwd	%xmm5, %xmm2
+	punpckhwd	%xmm5, %xmm1
+	punpcklwd	%xmm1, %xmm2
+	movdqa	%xmm2, (%r10)
 	addq	$16, %r10
 	cmpw	%r9w, %bx
 	ja	.L24
@@ -392,4 +392,4 @@ alignedConvertToS16SSE2:
 	.byte	0x2
 	.align 8
 .LEFDE5:
-	.ident	"GCC: (GNU) 4.4.0 20081204 (experimental)"
+	.ident	"GCC: (GNU) 4.4.0 20090304 (experimental)"