about summary refs log tree commit diff
path: root/src/string
diff options
context:
space:
mode:
authorRich Felker <dalias@aerifal.cx>2015-02-26 01:51:39 -0500
committerRich Felker <dalias@aerifal.cx>2015-02-26 01:51:39 -0500
commit69858fa93107aa7485b143c54137e745a7b7ad72 (patch)
treedcc3bcbf9fa71af0227341d8a305c626d429e0d9 /src/string
parent20cbd607759038dca57f84ef7e7b5d44a3088574 (diff)
downloadmusl-69858fa93107aa7485b143c54137e745a7b7ad72.tar.gz
musl-69858fa93107aa7485b143c54137e745a7b7ad72.tar.xz
musl-69858fa93107aa7485b143c54137e745a7b7ad72.zip
overhaul optimized i386 memset asm
on most cpu models, "rep stosl" has high overhead that makes it
undesirable for small memset sizes. the new code extends the
minimal-branch fast path for short memsets from size 15 up to size 62,
and shrink-wraps this code path. in addition, "rep stosl" is very
sensitive to misalignment. the cost varies with size and with cpu
model, but it has been observed performing 1.5 to 4 times slower when
the destination address is not aligned mod 16. the new code thus
ensures alignment mod 16, but also preserves any existing additional
alignment, in case there are cpu models where it is beneficial.

this version is based in part on changes to the x86_64 memset asm
proposed by Denys Vlasenko.
Diffstat (limited to 'src/string')
-rw-r--r--src/string/i386/memset.s93
1 file changed, 61 insertions, 32 deletions
diff --git a/src/string/i386/memset.s b/src/string/i386/memset.s
index 06ac923e..d00422c4 100644
--- a/src/string/i386/memset.s
+++ b/src/string/i386/memset.s
@@ -1,47 +1,76 @@
 .global memset
 .type memset,@function
 memset:
-	mov 8(%esp),%al
-	push %edi
-	mov %al,%ah
-	mov %al,%dl
-	mov 16(%esp),%ecx
-	shl $16,%eax
-	mov 8(%esp),%edi
-	mov %dl,%al
-	mov %dl,%ah
-	cmp $16,%ecx
-	jb 1f
+	mov 12(%esp),%ecx
+	cmp $62,%ecx
+	ja 2f
 
-	mov %eax,-4(%edi,%ecx)
-	shr $2,%ecx
-	rep
-	stosl
-	mov 8(%esp),%eax
-	pop %edi
-	ret
-
-1:	test %ecx,%ecx
+	mov 8(%esp),%dl
+	mov 4(%esp),%eax
+	test %ecx,%ecx
 	jz 1f
 
-	mov %al,(%edi)
-	mov %al,-1(%edi,%ecx)
+	mov %dl,%dh
+
+	mov %dl,(%eax)
+	mov %dl,-1(%eax,%ecx)
 	cmp $2,%ecx
 	jbe 1f
 
-	mov %al,1(%edi)
-	mov %al,-2(%edi,%ecx)
-	cmp $4,%ecx
+	mov %dx,1(%eax)
+	mov %dx,(-1-2)(%eax,%ecx)
+	cmp $6,%ecx
 	jbe 1f
 
-	mov %eax,(%edi)
-	mov %eax,-4(%edi,%ecx)
-	cmp $8,%ecx
+	shl $16,%edx
+	mov 8(%esp),%dl
+	mov 8(%esp),%dh
+
+	mov %edx,(1+2)(%eax)
+	mov %edx,(-1-2-4)(%eax,%ecx)
+	cmp $14,%ecx
 	jbe 1f
 
-	mov %eax,4(%edi)
-	mov %eax,-8(%edi,%ecx)
+	mov %edx,(1+2+4)(%eax)
+	mov %edx,(1+2+4+4)(%eax)
+	mov %edx,(-1-2-4-8)(%eax,%ecx)
+	mov %edx,(-1-2-4-4)(%eax,%ecx)
+	cmp $30,%ecx
+	jbe 1f
+
+	mov %edx,(1+2+4+8)(%eax)
+	mov %edx,(1+2+4+8+4)(%eax)
+	mov %edx,(1+2+4+8+8)(%eax)
+	mov %edx,(1+2+4+8+12)(%eax)
+	mov %edx,(-1-2-4-8-16)(%eax,%ecx)
+	mov %edx,(-1-2-4-8-12)(%eax,%ecx)
+	mov %edx,(-1-2-4-8-8)(%eax,%ecx)
+	mov %edx,(-1-2-4-8-4)(%eax,%ecx)
+
+1:	ret 	
+
+2:	movzbl 8(%esp),%eax
+	mov %edi,12(%esp)
+	imul $0x1010101,%eax
+	mov 4(%esp),%edi
+	test $15,%edi
+	mov %eax,-4(%edi,%ecx)
+	jnz 2f
 
-1:	mov 8(%esp),%eax
-	pop %edi
+1:	shr $2, %ecx
+	rep
+	stosl
+	mov 4(%esp),%eax
+	mov 12(%esp),%edi
 	ret
+	
+2:	xor %edx,%edx
+	sub %edi,%edx
+	and $15,%edx
+	mov %eax,(%edi)
+	mov %eax,4(%edi)
+	mov %eax,8(%edi)
+	mov %eax,12(%edi)
+	sub %edx,%ecx
+	add %edx,%edi
+	jmp 1b