Regenerated optimized x86 basic ops with latest GCC 4.4.0 snapshot

This commit is contained in:
Tobias Doerffel
2009-03-04 12:10:02 +01:00
parent ca8439301b
commit 18bbed4fd6
3 changed files with 383 additions and 342 deletions

View File

@@ -10,11 +10,11 @@ alignedMemCpyMMX:
movl 124(%esp), %eax
shrl $6, %ebx
#APP
# 42 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
# 42 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
fsave 4(%esp); fwait
# 0 "" 2
# 44 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
# 44 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
1: prefetchnta (%eax)
prefetchnta 64(%eax)
prefetchnta 128(%eax)
@@ -31,7 +31,7 @@ alignedMemCpyMMX:
.p2align 3
.L3:
#APP
# 53 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
# 53 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
1: prefetchnta 320(%eax)
2: movq (%eax), %mm0
movq 8(%eax), %mm1
@@ -59,7 +59,7 @@ alignedMemCpyMMX:
jne .L3
.L2:
#APP
# 75 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
# 75 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
fsave 4(%esp); fwait
# 0 "" 2
@@ -83,7 +83,7 @@ alignedMemClearMMX:
.p2align 3
.L9:
#APP
# 90 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
# 90 "/home/toby/development/git/lmms/src/core/basic_ops_x86.c" 1
movq %mm0, (%edx)
movq %mm0, 8(%edx)
movq %mm0, 16(%edx)
@@ -103,5 +103,5 @@ movq %mm0, 56(%edx)
emms
ret
.size alignedMemClearMMX, .-alignedMemClearMMX
.ident "GCC: (GNU) 4.4.0 20081204 (experimental)"
.ident "GCC: (GNU) 4.4.0 20090304 (experimental)"
.section .note.GNU-stack,"",@progbits

View File

@@ -77,16 +77,16 @@ alignedBufApplyGainSSE:
.p2align 4,,7
.p2align 3
.L14:
movaps %xmm0, %xmm3
movaps 16(%eax), %xmm3
addl $1, %edx
movaps %xmm0, %xmm2
movaps %xmm0, %xmm1
movaps %xmm0, %xmm4
mulps 16(%eax), %xmm3
mulps 32(%eax), %xmm2
mulps 48(%eax), %xmm1
movaps 32(%eax), %xmm2
mulps %xmm0, %xmm3
movaps 48(%eax), %xmm1
mulps %xmm0, %xmm2
movaps (%eax), %xmm4
mulps %xmm0, %xmm1
movaps %xmm3, 16(%eax)
mulps (%eax), %xmm4
mulps %xmm0, %xmm4
movaps %xmm2, 32(%eax)
movaps %xmm1, 48(%eax)
movaps %xmm4, (%eax)
@@ -148,28 +148,28 @@ alignedBufMixLRCoeffSSE:
movl 16(%esp), %ebx
testl %esi, %esi
jle .L25
movss 24(%esp), %xmm0
movss 24(%esp), %xmm2
subl $1, %esi
movss 20(%esp), %xmm1
movss 20(%esp), %xmm0
xorl %eax, %eax
shrl $2, %esi
xorl %ecx, %ecx
addl $1, %esi
unpcklps %xmm0, %xmm1
movaps %xmm1, %xmm0
movlhps %xmm1, %xmm0
unpcklps %xmm2, %xmm0
movaps %xmm0, %xmm2
movlhps %xmm0, %xmm2
.p2align 4,,7
.p2align 3
.L24:
movaps %xmm0, %xmm1
movaps 16(%ebx,%eax), %xmm0
addl $1, %ecx
movaps %xmm0, %xmm2
mulps 16(%ebx,%eax), %xmm1
mulps (%ebx,%eax), %xmm2
addps 16(%edx,%eax), %xmm1
addps (%edx,%eax), %xmm2
movaps %xmm1, 16(%edx,%eax)
movaps %xmm2, (%edx,%eax)
movaps (%ebx,%eax), %xmm1
mulps %xmm2, %xmm0
mulps %xmm2, %xmm1
addps 16(%edx,%eax), %xmm0
addps (%edx,%eax), %xmm1
movaps %xmm0, 16(%edx,%eax)
movaps %xmm1, (%edx,%eax)
addl $32, %eax
cmpl %ecx, %esi
ja .L24
@@ -189,31 +189,31 @@ alignedBufWetDryMixSSE:
movl 16(%esp), %ebx
testl %esi, %esi
jle .L30
movss 24(%esp), %xmm1
movss 24(%esp), %xmm3
subl $1, %esi
movss 20(%esp), %xmm0
movss 20(%esp), %xmm2
xorl %eax, %eax
shrl $2, %esi
xorl %ecx, %ecx
shufps $0, %xmm1, %xmm1
shufps $0, %xmm3, %xmm3
addl $1, %esi
shufps $0, %xmm0, %xmm0
shufps $0, %xmm2, %xmm2
.p2align 4,,7
.p2align 3
.L29:
movaps %xmm1, %xmm3
movaps 16(%ebx,%eax), %xmm1
addl $1, %ecx
movaps %xmm0, %xmm2
movaps %xmm1, %xmm4
mulps 16(%edx,%eax), %xmm3
mulps 16(%ebx,%eax), %xmm2
mulps (%edx,%eax), %xmm4
addps %xmm3, %xmm2
movaps %xmm0, %xmm3
mulps (%ebx,%eax), %xmm3
movaps %xmm2, 16(%edx,%eax)
addps %xmm4, %xmm3
movaps %xmm3, (%edx,%eax)
movaps 16(%edx,%eax), %xmm0
mulps %xmm2, %xmm1
movaps (%ebx,%eax), %xmm4
mulps %xmm3, %xmm0
mulps %xmm2, %xmm4
addps %xmm1, %xmm0
movaps (%edx,%eax), %xmm1
mulps %xmm3, %xmm1
movaps %xmm0, 16(%edx,%eax)
addps %xmm4, %xmm1
movaps %xmm1, (%edx,%eax)
addl $32, %eax
cmpl %ecx, %esi
ja .L29
@@ -230,131 +230,129 @@ alignedBufWetDryMixSplittedSSE:
pushl %edi
pushl %esi
pushl %ebx
subl $124, %esp
movl 164(%esp), %eax
movl 144(%esp), %edx
movl 148(%esp), %esi
movl 152(%esp), %ecx
subl $140, %esp
movl 180(%esp), %eax
flds 172(%esp)
movl 160(%esp), %edx
movl 164(%esp), %esi
testl %eax, %eax
movss 156(%esp), %xmm4
movss 160(%esp), %xmm5
jle .L39
movl 164(%esp), %eax
movl 168(%esp), %ecx
flds 176(%esp)
jle .L43
movl 180(%esp), %eax
subl $1, %eax
shrl %eax
addl $1, %eax
movl %eax, %ebp
movl %eax, 96(%esp)
movl %eax, 120(%esp)
shrl $2, %ebp
cmpl $3, 96(%esp)
cmpl $3, 120(%esp)
leal 0(,%ebp,4), %eax
movl %eax, 100(%esp)
movl %eax, 124(%esp)
jbe .L40
testl %eax, %eax
jne .L34
.L40:
fxch %st(1)
xorl %edi, %edi
jmp .L36
.p2align 4,,7
.p2align 3
.L34:
movaps %xmm4, %xmm2
xorps %xmm6, %xmm6
shufps $0, %xmm2, %xmm2
movaps %xmm5, %xmm1
fsts 12(%esp)
fxch %st(1)
xorps %xmm7, %xmm7
movss 12(%esp), %xmm0
movl %esi, %ebx
shufps $0, %xmm1, %xmm1
movaps %xmm2, (%esp)
fsts 12(%esp)
xorl %eax, %eax
xorl %edi, %edi
movss %xmm5, 108(%esp)
movss %xmm4, 104(%esp)
movaps %xmm1, %xmm4
shufps $0, %xmm0, %xmm0
movaps %xmm0, 32(%esp)
movss 12(%esp), %xmm0
shufps $0, %xmm0, %xmm0
movaps %xmm0, 16(%esp)
.p2align 4,,7
.p2align 3
.L37:
movaps 16(%edx,%eax,2), %xmm3
movaps (%edx,%eax,2), %xmm5
addl $1, %edi
movaps (%edx,%eax,2), %xmm2
movaps 32(%edx,%eax,2), %xmm1
movaps %xmm2, %xmm7
shufps $221, %xmm3, %xmm2
movaps 48(%edx,%eax,2), %xmm0
shufps $136, %xmm3, %xmm7
movaps %xmm2, 64(%esp)
movaps %xmm1, %xmm2
shufps $221, %xmm0, %xmm1
shufps $136, %xmm0, %xmm2
movaps %xmm6, %xmm3
movaps %xmm2, 48(%esp)
movlps (%ebx), %xmm3
movhps 8(%ebx), %xmm3
movaps %xmm7, %xmm5
movaps %xmm3, %xmm0
movaps %xmm6, %xmm2
movlps 16(%ebx), %xmm2
shufps $136, 48(%esp), %xmm5
movhps 24(%ebx), %xmm2
shufps $136, %xmm2, %xmm0
addl $32, %ebx
mulps %xmm4, %xmm5
shufps $221, %xmm2, %xmm3
movaps %xmm1, 32(%esp)
mulps (%esp), %xmm0
movaps %xmm6, %xmm1
shufps $221, 48(%esp), %xmm7
movlps (%ecx,%eax), %xmm1
movhps 8(%ecx,%eax), %xmm1
movaps 64(%esp), %xmm2
mulps %xmm4, %xmm7
addps %xmm0, %xmm5
movaps %xmm6, %xmm0
movlps 16(%ecx,%eax), %xmm0
movhps 24(%ecx,%eax), %xmm0
shufps $221, 32(%esp), %xmm2
movaps %xmm5, 16(%esp)
movaps 64(%esp), %xmm5
mulps %xmm4, %xmm2
shufps $136, 32(%esp), %xmm5
mulps (%esp), %xmm3
mulps %xmm4, %xmm5
addps %xmm3, %xmm7
movaps 16(%esp), %xmm3
movaps %xmm5, 80(%esp)
movaps %xmm1, %xmm5
shufps $221, %xmm0, %xmm1
shufps $136, %xmm0, %xmm5
mulps (%esp), %xmm1
unpcklps %xmm7, %xmm3
mulps (%esp), %xmm5
movaps 16(%esp), %xmm0
addps %xmm1, %xmm2
movaps %xmm3, %xmm1
addps 80(%esp), %xmm5
unpckhps %xmm7, %xmm0
movaps %xmm0, %xmm7
movaps 16(%edx,%eax,2), %xmm6
movaps %xmm5, %xmm0
unpcklps %xmm2, %xmm0
unpckhps %xmm2, %xmm5
unpcklps %xmm0, %xmm1
unpckhps %xmm0, %xmm3
movaps %xmm7, %xmm0
unpckhps %xmm5, %xmm7
unpcklps %xmm5, %xmm0
shufps $136, %xmm6, %xmm0
movaps 32(%edx,%eax,2), %xmm4
shufps $221, %xmm6, %xmm5
movaps %xmm0, 96(%esp)
movaps 48(%edx,%eax,2), %xmm3
movaps %xmm4, %xmm0
shufps $136, %xmm3, %xmm0
movaps 96(%esp), %xmm2
shufps $221, %xmm3, %xmm4
movaps %xmm7, %xmm6
movlps (%ebx), %xmm6
movaps %xmm5, 80(%esp)
movhps 8(%ebx), %xmm6
shufps $136, %xmm0, %xmm2
movaps %xmm0, 64(%esp)
movaps %xmm7, %xmm5
movaps %xmm6, %xmm0
movlps 16(%ebx), %xmm5
movhps 24(%ebx), %xmm5
shufps $136, %xmm5, %xmm0
mulps 32(%esp), %xmm2
shufps $221, %xmm5, %xmm6
movaps %xmm4, 48(%esp)
addl $32, %ebx
mulps 16(%esp), %xmm0
movaps %xmm7, %xmm4
movlps (%eax,%ecx), %xmm4
movaps %xmm7, %xmm3
movhps 8(%eax,%ecx), %xmm4
movaps %xmm4, %xmm1
movlps 16(%ecx,%eax), %xmm3
movhps 24(%ecx,%eax), %xmm3
shufps $136, %xmm3, %xmm1
addps %xmm0, %xmm2
movaps 80(%esp), %xmm0
shufps $221, %xmm3, %xmm4
shufps $136, 48(%esp), %xmm0
mulps 16(%esp), %xmm1
movaps %xmm2, %xmm3
movaps 80(%esp), %xmm5
mulps 32(%esp), %xmm0
shufps $221, 48(%esp), %xmm5
mulps 16(%esp), %xmm6
addps %xmm1, %xmm0
movaps 96(%esp), %xmm1
shufps $221, 64(%esp), %xmm1
mulps 16(%esp), %xmm4
mulps 32(%esp), %xmm1
mulps 32(%esp), %xmm5
addps %xmm6, %xmm1
addps %xmm4, %xmm5
movaps %xmm0, %xmm4
unpcklps %xmm1, %xmm3
unpcklps %xmm5, %xmm4
unpckhps %xmm1, %xmm2
movaps %xmm3, %xmm1
unpckhps %xmm5, %xmm0
unpcklps %xmm4, %xmm1
unpckhps %xmm4, %xmm3
movaps %xmm1, (%edx,%eax,2)
movaps %xmm2, %xmm1
unpckhps %xmm0, %xmm2
unpcklps %xmm0, %xmm1
movaps %xmm3, 16(%edx,%eax,2)
movaps %xmm0, 32(%edx,%eax,2)
movaps %xmm7, 48(%edx,%eax,2)
movaps %xmm1, 32(%edx,%eax,2)
movaps %xmm2, 48(%edx,%eax,2)
addl $32, %eax
cmpl %edi, %ebp
ja .L37
movl 100(%esp), %edi
movl 96(%esp), %eax
movss 104(%esp), %xmm4
movss 108(%esp), %xmm5
movl 124(%esp), %edi
movl 120(%esp), %eax
addl %edi, %edi
cmpl %eax, 100(%esp)
je .L39
cmpl %eax, 124(%esp)
je .L44
.L36:
leal (%edx,%edi,8), %ebx
xorl %ebp, %ebp
@@ -363,38 +361,50 @@ alignedBufWetDryMixSplittedSSE:
.p2align 4,,7
.p2align 3
.L38:
movaps %xmm5, %xmm1
flds (%ebx)
addl $2, %ebp
movaps %xmm4, %xmm0
mulss (%ebx), %xmm1
mulss (%esi,%eax,4), %xmm0
addss %xmm1, %xmm0
movaps %xmm5, %xmm1
movss %xmm0, (%ebx)
movaps %xmm4, %xmm0
mulss 4(%ebx), %xmm1
mulss (%ecx,%eax,4), %xmm0
addss %xmm1, %xmm0
movaps %xmm5, %xmm1
movss %xmm0, 4(%ebx)
fmul %st(2), %st
flds (%esi,%eax,4)
fmul %st(2), %st
faddp %st, %st(1)
fstps (%ebx)
flds 4(%ebx)
fmul %st(2), %st
flds (%ecx,%eax,4)
fmul %st(2), %st
faddp %st, %st(1)
fstps 4(%ebx)
addl $16, %ebx
movaps %xmm4, %xmm0
mulss (%edx), %xmm1
mulss 4(%esi,%eax,4), %xmm0
addss %xmm1, %xmm0
movaps %xmm5, %xmm1
movss %xmm0, (%edx)
movaps %xmm4, %xmm0
mulss 4(%edx), %xmm1
mulss 4(%ecx,%eax,4), %xmm0
flds (%edx)
fmul %st(2), %st
flds 4(%esi,%eax,4)
fmul %st(2), %st
faddp %st, %st(1)
fstps (%edx)
flds 4(%edx)
fmul %st(2), %st
flds 4(%ecx,%eax,4)
leal (%edi,%ebp), %eax
addss %xmm1, %xmm0
movss %xmm0, 4(%edx)
fmul %st(2), %st
faddp %st, %st(1)
fstps 4(%edx)
addl $16, %edx
cmpl %eax, 164(%esp)
cmpl %eax, 180(%esp)
jg .L38
fstp %st(0)
fstp %st(0)
jmp .L39
.L43:
fstp %st(0)
fstp %st(0)
jmp .L39
.L44:
fstp %st(0)
fstp %st(0)
.p2align 4,,7
.p2align 3
.L39:
addl $124, %esp
addl $140, %esp
popl %ebx
popl %esi
popl %edi
@@ -407,34 +417,39 @@ alignedBufWetDryMixSplittedSSE:
unalignedBufMixLRCoeffSSE:
pushl %esi
pushl %ebx
movl 28(%esp), %esi
movl 12(%esp), %eax
movl 16(%esp), %edx
movss 20(%esp), %xmm0
movl %esi, %ecx
shrl $31, %ecx
leal (%esi,%ecx), %ebx
andl $1, %ebx
cmpl %ecx, %ebx
movss 24(%esp), %xmm3
jne .L52
.L44:
subl $4, %esp
movl 32(%esp), %esi
flds 24(%esp)
movl 16(%esp), %eax
movl 20(%esp), %edx
movl %esi, %ebx
flds 28(%esp)
shrl $31, %ebx
leal (%esi,%ebx), %ecx
andl $1, %ecx
cmpl %ebx, %ecx
jne .L54
.L46:
testl %esi, %esi
jle .L49
jle .L55
leal -1(%esi), %ebx
shrl %ebx
testb $15, %al
jne .L46
movaps %xmm0, %xmm1
jne .L48
fxch %st(1)
fstps (%esp)
xorps %xmm2, %xmm2
unpcklps %xmm3, %xmm1
movss (%esp), %xmm0
addl $1, %ebx
fstps (%esp)
xorl %ecx, %ecx
movaps %xmm1, %xmm3
movlhps %xmm1, %xmm3
movss (%esp), %xmm1
unpcklps %xmm1, %xmm0
movaps %xmm0, %xmm3
movlhps %xmm0, %xmm3
.p2align 4,,7
.p2align 3
.L47:
.L49:
movaps %xmm2, %xmm1
addl $1, %ecx
movlps (%edx), %xmm1
@@ -448,55 +463,65 @@ unalignedBufMixLRCoeffSSE:
movaps %xmm0, (%eax)
addl $16, %eax
cmpl %ebx, %ecx
jb .L47
.L49:
jb .L49
jmp .L51
.p2align 4,,7
.p2align 3
.L55:
fstp %st(0)
fstp %st(0)
.p2align 4,,7
.p2align 3
.L51:
addl $4, %esp
popl %ebx
popl %esi
ret
.p2align 4,,7
.p2align 3
.L46:
xorl %ecx, %ecx
.p2align 4,,7
.p2align 3
.L48:
movaps %xmm0, %xmm1
mulss (%edx,%ecx,8), %xmm1
addss (%eax,%ecx,8), %xmm1
movss %xmm1, (%eax,%ecx,8)
movaps %xmm3, %xmm1
mulss 4(%edx,%ecx,8), %xmm1
addss 4(%eax,%ecx,8), %xmm1
movss %xmm1, 4(%eax,%ecx,8)
movaps %xmm0, %xmm1
mulss 8(%edx,%ecx,8), %xmm1
addss 8(%eax,%ecx,8), %xmm1
movss %xmm1, 8(%eax,%ecx,8)
movaps %xmm3, %xmm1
mulss 12(%edx,%ecx,8), %xmm1
addss 12(%eax,%ecx,8), %xmm1
movss %xmm1, 12(%eax,%ecx,8)
xorl %ecx, %ecx
.p2align 4,,7
.p2align 3
.L50:
flds (%edx,%ecx,8)
fmul %st(2), %st
fadds (%eax,%ecx,8)
fstps (%eax,%ecx,8)
flds 4(%edx,%ecx,8)
fmul %st(1), %st
fadds 4(%eax,%ecx,8)
fstps 4(%eax,%ecx,8)
flds 8(%edx,%ecx,8)
fmul %st(2), %st
fadds 8(%eax,%ecx,8)
fstps 8(%eax,%ecx,8)
flds 12(%edx,%ecx,8)
fmul %st(1), %st
fadds 12(%eax,%ecx,8)
fstps 12(%eax,%ecx,8)
addl $2, %ecx
cmpl %ecx, %esi
jg .L48
jg .L50
fstp %st(0)
fstp %st(0)
addl $4, %esp
popl %ebx
popl %esi
ret
.L52:
movaps %xmm0, %xmm1
.L54:
flds (%edx)
subl $1, %esi
movss (%eax), %xmm2
mulss (%edx), %xmm1
addss %xmm2, %xmm1
movss 4(%eax), %xmm2
movss %xmm1, (%eax)
movaps %xmm3, %xmm1
mulss 4(%edx), %xmm1
fmul %st(2), %st
fadds (%eax)
fstps (%eax)
flds 4(%edx)
addl $8, %edx
addss %xmm2, %xmm1
movss %xmm1, 4(%eax)
fmul %st(1), %st
fadds 4(%eax)
fstps 4(%eax)
addl $8, %eax
jmp .L44
jmp .L46
.size unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE
.ident "GCC: (GNU) 4.4.0 20081204 (experimental)"
.ident "GCC: (GNU) 4.4.0 20090304 (experimental)"
.section .note.GNU-stack,"",@progbits

View File

@@ -70,19 +70,19 @@ alignedConvertToS16SSE2:
pushl %ebx
subl $8, %esp
movl 36(%esp), %eax
movss .LC0, %xmm4
cmpb $0, 44(%esp)
flds .LC0
movl 28(%esp), %edx
movl 32(%esp), %ebx
movl %eax, %esi
mulss 40(%esp), %xmm4
fmuls 40(%esp)
jne .L13
testw %ax, %ax
jle .L15
jle .L35
movl %eax, %edi
shrw $2, %di
cmpw $3, %ax
movw %ax, 2(%esp)
movw %ax, 4(%esp)
leal 0(,%edi,4), %ebp
ja .L33
.L28:
@@ -98,17 +98,19 @@ alignedConvertToS16SSE2:
.p2align 4,,7
.p2align 3
.L25:
movaps %xmm4, %xmm0
mulss (%edx), %xmm0
cvttss2si %xmm0, %ecx
movaps %xmm4, %xmm0
mulss 4(%edx), %xmm0
flds (%edx)
fmul %st(1), %st
fstps 4(%esp)
cvttss2si 4(%esp), %ecx
flds 4(%edx)
fmul %st(1), %st
cmpl $-32768, %ecx
cmovl %edi, %ecx
cmpl $32767, %ecx
cmovg %ebx, %ecx
fstps 4(%esp)
movw %cx, (%eax)
cvttss2si %xmm0, %ecx
cvttss2si 4(%esp), %ecx
cmpl $-32768, %ecx
cmovl %edi, %ecx
cmpl $32767, %ecx
@@ -119,6 +121,15 @@ alignedConvertToS16SSE2:
addl $4, %eax
cmpw %bp, %si
jg .L25
fstp %st(0)
jmp .L15
.L35:
fstp %st(0)
jmp .L15
.L36:
fstp %st(0)
.p2align 4,,7
.p2align 3
.L15:
movswl %si,%esi
addl $8, %esp
@@ -132,11 +143,11 @@ alignedConvertToS16SSE2:
.p2align 3
.L13:
testw %ax, %ax
jle .L15
jle .L36
movl %eax, %ebp
shrw $2, %bp
cmpw $3, %si
movw %ax, 2(%esp)
movw %ax, 4(%esp)
leal 0(,%ebp,4), %eax
ja .L34
.L27:
@@ -151,12 +162,13 @@ alignedConvertToS16SSE2:
.p2align 4,,7
.p2align 3
.L20:
movaps %xmm4, %xmm0
flds (%ecx)
movl $32767, %ebp
mulss (%ecx), %xmm0
cvttss2si %xmm0, %ebx
movaps %xmm4, %xmm0
mulss 4(%ecx), %xmm0
fmul %st(1), %st
fstps 4(%esp)
cvttss2si 4(%esp), %ebx
flds 4(%ecx)
fmul %st(1), %st
cmpl $-32768, %ebx
cmovl %edi, %ebx
cmpl $32767, %ebx
@@ -165,8 +177,9 @@ alignedConvertToS16SSE2:
sall $8, %ebx
orl %ebp, %ebx
movl $32767, %ebp
fstps 4(%esp)
movw %bx, (%edx)
cvttss2si %xmm0, %ebx
cvttss2si 4(%esp), %ebx
cmpl $-32768, %ebx
cmovl %edi, %ebx
cmpl $32767, %ebx
@@ -180,146 +193,149 @@ alignedConvertToS16SSE2:
addl $4, %edx
cmpw %ax, %si
jg .L20
fstp %st(0)
jmp .L15
.p2align 4,,7
.p2align 3
.L34:
testw %ax, %ax
je .L27
movaps %xmm4, %xmm0
fsts (%esp)
xorl %ecx, %ecx
movdqa .LC1, %xmm1
movss %xmm4, 4(%esp)
shufps $0, %xmm0, %xmm0
movdqa .LC1, %xmm3
movss (%esp), %xmm0
xorl %edi, %edi
movdqa .LC2, %xmm2
shufps $0, %xmm0, %xmm0
movaps %xmm0, %xmm7
movdqa .LC2, %xmm0
.p2align 4,,7
.p2align 3
.L19:
movaps %xmm7, %xmm3
movdqa %xmm0, %xmm5
movdqa %xmm0, %xmm6
movaps %xmm7, %xmm2
movaps (%edx,%ecx,2), %xmm0
movdqa %xmm2, %xmm4
movdqa %xmm2, %xmm6
addl $1, %edi
mulps (%edx,%ecx,2), %xmm3
mulps 16(%edx,%ecx,2), %xmm2
cvttps2dq %xmm3, %xmm3
movdqa %xmm3, %xmm4
pcmpgtd %xmm1, %xmm4
pand %xmm4, %xmm3
pandn %xmm1, %xmm4
por %xmm4, %xmm3
cvttps2dq %xmm2, %xmm2
movdqa %xmm3, %xmm4
pcmpgtd %xmm0, %xmm4
pand %xmm4, %xmm5
pandn %xmm3, %xmm4
movdqa %xmm4, %xmm3
movdqa %xmm2, %xmm4
por %xmm5, %xmm3
pcmpgtd %xmm1, %xmm4
movaps 16(%edx,%ecx,2), %xmm5
mulps %xmm7, %xmm0
mulps %xmm7, %xmm5
cvttps2dq %xmm0, %xmm0
movdqa %xmm0, %xmm1
pcmpgtd %xmm3, %xmm1
pand %xmm1, %xmm0
pandn %xmm3, %xmm1
por %xmm0, %xmm1
cvttps2dq %xmm5, %xmm5
movdqa %xmm1, %xmm0
pcmpgtd %xmm2, %xmm0
pand %xmm0, %xmm4
pandn %xmm1, %xmm0
movdqa %xmm0, %xmm1
movdqa %xmm5, %xmm0
por %xmm4, %xmm1
pcmpgtd %xmm3, %xmm0
movdqa .LC3, %xmm4
pand %xmm0, %xmm5
pand %xmm1, %xmm4
pandn %xmm3, %xmm0
psrad $8, %xmm4
por %xmm5, %xmm0
pslld $8, %xmm1
movdqa %xmm0, %xmm5
pcmpgtd %xmm2, %xmm5
pand %xmm5, %xmm6
pandn %xmm0, %xmm5
movdqa %xmm5, %xmm0
movdqa .LC3, %xmm5
pand %xmm4, %xmm2
pand %xmm3, %xmm5
pandn %xmm1, %xmm4
por %xmm6, %xmm0
pand %xmm0, %xmm5
pslld $8, %xmm0
psrad $8, %xmm5
por %xmm4, %xmm2
pslld $8, %xmm3
movdqa %xmm2, %xmm4
pcmpgtd %xmm0, %xmm4
pand %xmm4, %xmm6
pandn %xmm2, %xmm4
movdqa %xmm4, %xmm2
por %xmm6, %xmm2
movdqa .LC3, %xmm6
pand %xmm2, %xmm6
pslld $8, %xmm2
psrad $8, %xmm6
movdqa %xmm5, %xmm4
punpcklwd %xmm6, %xmm5
punpckhwd %xmm6, %xmm4
movdqa %xmm5, %xmm6
punpcklwd %xmm4, %xmm5
punpckhwd %xmm4, %xmm6
movdqa %xmm3, %xmm4
punpcklwd %xmm6, %xmm5
punpckhwd %xmm2, %xmm4
punpcklwd %xmm2, %xmm3
movdqa %xmm3, %xmm6
punpcklwd %xmm4, %xmm3
punpckhwd %xmm4, %xmm6
punpcklwd %xmm6, %xmm3
por %xmm3, %xmm5
movdqa %xmm5, (%ebx,%ecx)
movdqa %xmm4, %xmm6
punpcklwd %xmm5, %xmm4
punpckhwd %xmm5, %xmm6
movdqa %xmm4, %xmm5
punpcklwd %xmm6, %xmm4
punpckhwd %xmm6, %xmm5
punpcklwd %xmm5, %xmm4
movdqa %xmm1, %xmm5
punpcklwd %xmm0, %xmm1
punpckhwd %xmm0, %xmm5
movdqa %xmm1, %xmm0
punpcklwd %xmm5, %xmm1
punpckhwd %xmm5, %xmm0
punpcklwd %xmm0, %xmm1
por %xmm1, %xmm4
movdqa %xmm4, (%ebx,%ecx)
addl $16, %ecx
cmpw %di, %bp
ja .L19
cmpw 2(%esp), %ax
movss 4(%esp), %xmm4
cmpw 4(%esp), %ax
jne .L18
fstp %st(0)
jmp .L15
.p2align 4,,7
.p2align 3
.L33:
testw %bp, %bp
.p2align 4,,3
.p2align 4,,4
.p2align 3
je .L28
movaps %xmm4, %xmm0
fsts (%esp)
xorl %eax, %eax
movdqa .LC1, %xmm1
shufps $0, %xmm0, %xmm0
movdqa .LC1, %xmm3
movss (%esp), %xmm0
xorl %ecx, %ecx
movaps %xmm0, %xmm6
movdqa .LC2, %xmm0
movdqa .LC2, %xmm2
shufps $0, %xmm0, %xmm0
movaps %xmm0, %xmm5
.p2align 4,,7
.p2align 3
.L24:
movaps %xmm6, %xmm3
movaps (%edx,%eax,2), %xmm0
addl $1, %ecx
movdqa %xmm0, %xmm7
movaps %xmm6, %xmm2
mulps (%edx,%eax,2), %xmm3
mulps 16(%edx,%eax,2), %xmm2
cvttps2dq %xmm3, %xmm3
movdqa %xmm3, %xmm5
pcmpgtd %xmm1, %xmm5
pand %xmm5, %xmm3
pandn %xmm1, %xmm5
por %xmm5, %xmm3
cvttps2dq %xmm2, %xmm2
movdqa %xmm3, %xmm5
pcmpgtd %xmm0, %xmm5
pand %xmm5, %xmm7
pandn %xmm3, %xmm5
movdqa %xmm5, %xmm3
movdqa %xmm2, %xmm5
por %xmm7, %xmm3
pcmpgtd %xmm1, %xmm5
movdqa %xmm0, %xmm7
pand %xmm5, %xmm2
pandn %xmm1, %xmm5
por %xmm5, %xmm2
movdqa %xmm2, %xmm5
pcmpgtd %xmm0, %xmm5
pand %xmm5, %xmm7
pandn %xmm2, %xmm5
movdqa %xmm5, %xmm2
movdqa %xmm3, %xmm5
por %xmm7, %xmm2
punpckhwd %xmm2, %xmm5
punpcklwd %xmm2, %xmm3
movdqa %xmm3, %xmm7
punpcklwd %xmm5, %xmm3
punpckhwd %xmm5, %xmm7
punpcklwd %xmm7, %xmm3
movdqa %xmm3, (%ebx,%eax)
movdqa %xmm2, %xmm6
movaps 16(%edx,%eax,2), %xmm4
mulps %xmm5, %xmm0
mulps %xmm5, %xmm4
cvttps2dq %xmm0, %xmm0
movdqa %xmm0, %xmm1
pcmpgtd %xmm3, %xmm1
pand %xmm1, %xmm0
pandn %xmm3, %xmm1
por %xmm0, %xmm1
cvttps2dq %xmm4, %xmm4
movdqa %xmm1, %xmm0
pcmpgtd %xmm2, %xmm0
pand %xmm0, %xmm6
pandn %xmm1, %xmm0
movdqa %xmm0, %xmm1
movdqa %xmm4, %xmm0
por %xmm6, %xmm1
pcmpgtd %xmm3, %xmm0
movdqa %xmm2, %xmm6
pand %xmm0, %xmm4
pandn %xmm3, %xmm0
por %xmm4, %xmm0
movdqa %xmm0, %xmm4
pcmpgtd %xmm2, %xmm4
pand %xmm4, %xmm6
pandn %xmm0, %xmm4
movdqa %xmm4, %xmm0
movdqa %xmm1, %xmm4
por %xmm6, %xmm0
punpckhwd %xmm0, %xmm4
punpcklwd %xmm0, %xmm1
movdqa %xmm1, %xmm0
punpcklwd %xmm4, %xmm1
punpckhwd %xmm4, %xmm0
punpcklwd %xmm0, %xmm1
movdqa %xmm1, (%ebx,%eax)
addl $16, %eax
cmpw %cx, %di
ja .L24
cmpw %bp, 2(%esp)
cmpw %bp, 4(%esp)
jne .L23
fstp %st(0)
jmp .L15
.size alignedConvertToS16SSE2, .-alignedConvertToS16SSE2
.section .rodata.cst4,"aM",@progbits,4
@@ -345,5 +361,5 @@ alignedConvertToS16SSE2:
.long 65280
.long 65280
.long 65280
.ident "GCC: (GNU) 4.4.0 20081204 (experimental)"
.ident "GCC: (GNU) 4.4.0 20090304 (experimental)"
.section .note.GNU-stack,"",@progbits