optimized memset asm for i386 and x86_64

the concept of both versions is the same; they differ only in details. for long runs, they use "rep stosl" or "rep stosq", and for small runs, they use a trick, writing from both ends towards the middle, that reduces the number of branches needed. in addition, if memset is called multiple times with the same length, all branches will be predicted; there are no loops. for larger runs, there are likely faster approaches than "rep", at least on some cpu models. for 32-bit, it's unlikely that there is any faster approach that does not require non-baseline instructions; doing anything fancier would require inspecting cpu capabilities. for 64-bit, there may very well be faster versions that work on all models; further optimization could be explored in the future. with these changes, memset is anywhere between 50% faster and 6 times faster, depending on the cpu model and the length and alignment of the destination buffer.
author: Rich Felker <dalias@aerifal.cx> 2013-08-01 21:44:43 -0400
committer: Rich Felker <dalias@aerifal.cx> 2013-08-01 21:44:43 -0400
commit: 926272ddffa293ee68ffeb01422fc8c792acf428 (patch)
tree: 8997ae7c582e5d4dbaf0056d940d15702d728258 /src/string
parent: 4a1f55e92fa74ee382909baa96302231f566b5e1 (diff)
download: musl-926272ddffa293ee68ffeb01422fc8c792acf428.tar.gz
musl-926272ddffa293ee68ffeb01422fc8c792acf428.tar.xz
musl-926272ddffa293ee68ffeb01422fc8c792acf428.zip
2 files changed, 88 insertions, 0 deletions
diff --git a/src/string/i386/memset.s b/src/string/i386/memset.s
new file mode 100644
index 00000000..06ac923e
--- /dev/null
+++ b/src/string/i386/memset.s
@@ -0,0 +1,47 @@
+# void *memset(void *dest, int c, size_t n) -- i386, stack arguments,
+# AT&T syntax.
+# Stack on entry: 4(%esp)=dest, 8(%esp)=c, 12(%esp)=n; each offset is
+# four larger once %edi has been pushed.
+# Register roles: %eax = fill byte replicated into all four bytes,
+# %ecx = n, %edi = dest, %dl = scratch copy of the fill byte.
+# Returns dest in %eax; %edi is saved and restored. No cld is issued,
+# so the direction flag is expected to be clear on entry.
+.global memset
+.type memset,@function
+memset:
+	movb 8(%esp),%al		# al = low byte of c (read before the push)
+	pushl %edi
+	movb %al,%ah
+	movb %al,%dl			# keep a copy; the shift empties al/ah
+	movl 16(%esp),%ecx		# ecx = n
+	shll $16,%eax			# two copies now sit in the high half
+	movl 8(%esp),%edi		# edi = dest
+	movb %dl,%al
+	movb %dl,%ah			# eax = fill byte in all four positions
+	cmpl $16,%ecx
+	jb .Lshort			# under 16 bytes: handled without stos
+
+	# Long run: store the final dword first so the 0-3 bytes left
+	# over after n/4 whole dwords are already filled.
+	movl %eax,-4(%edi,%ecx)
+	shrl $2,%ecx			# ecx = number of whole dwords
+	rep stosl
+	movl 8(%esp),%eax		# return value = dest, reloaded from the stack
+	popl %edi
+	ret
+
+.Lshort:
+	testl %ecx,%ecx
+	jz .Lret			# n == 0: nothing to store
+
+	# Short run: store from both ends towards the middle. Each step
+	# doubles the span covered; overlapping stores are harmless.
+	movb %al,(%edi)
+	movb %al,-1(%edi,%ecx)
+	cmpl $2,%ecx
+	jbe .Lret			# n is 1 or 2
+
+	movb %al,1(%edi)
+	movb %al,-2(%edi,%ecx)
+	cmpl $4,%ecx
+	jbe .Lret			# n is 3 or 4
+
+	movl %eax,(%edi)
+	movl %eax,-4(%edi,%ecx)
+	cmpl $8,%ecx
+	jbe .Lret			# n is 5 through 8
+
+	movl %eax,4(%edi)		# n is 9 through 15
+	movl %eax,-8(%edi,%ecx)
+
+.Lret:
+	movl 8(%esp),%eax		# return value = dest
+	popl %edi
+	ret
diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
new file mode 100644
index 00000000..fc06eef8
--- /dev/null
+++ b/src/string/x86_64/memset.s
@@ -0,0 +1,41 @@
+# void *memset(void *dest, int c, size_t n) -- x86_64 System V, AT&T syntax.
+# In:  %rdi = dest, %esi = c, %rdx = n.
+# Out: %rax = dest.
+# Register roles: %rax = fill byte replicated into all eight bytes (until
+# the return value is loaded), %rcx = n, %r8 = saved dest. %rsi and the
+# flags are also modified; the stack is not touched. No cld is issued, so
+# the direction flag is expected to be clear on entry.
+.global memset
+.type memset,@function
+memset:
+	andl $0xff,%esi			# keep the low byte of c; upper rsi becomes zero
+	movabsq $0x0101010101010101,%rax
+	movq %rdx,%rcx			# rcx = n (count register for rep)
+	movq %rdi,%r8			# rep stosq advances rdi, so remember dest
+	imulq %rsi,%rax			# rax = fill byte in all eight positions
+	cmpq $16,%rcx
+	jb .Lshort			# under 16 bytes: handled without stos
+
+	# Long run: store the final qword first so the 0-7 bytes left
+	# over after n/8 whole qwords are already filled.
+	movq %rax,-8(%rdi,%rcx)
+	shrq $3,%rcx			# rcx = number of whole qwords
+	rep stosq
+	movq %r8,%rax			# return value = dest
+	ret
+
+.Lshort:
+	testl %ecx,%ecx			# n < 16 here, so the low 32 bits suffice
+	jz .Lret			# n == 0: nothing to store
+
+	# Short run: store from both ends towards the middle. Each step
+	# doubles the span covered; overlapping stores are harmless.
+	movb %al,(%rdi)
+	movb %al,-1(%rdi,%rcx)
+	cmpl $2,%ecx
+	jbe .Lret			# n is 1 or 2
+
+	movb %al,1(%rdi)
+	movb %al,-2(%rdi,%rcx)
+	cmpl $4,%ecx
+	jbe .Lret			# n is 3 or 4
+
+	movl %eax,(%rdi)
+	movl %eax,-4(%rdi,%rcx)
+	cmpl $8,%ecx
+	jbe .Lret			# n is 5 through 8
+
+	movl %eax,4(%rdi)		# n is 9 through 15
+	movl %eax,-8(%rdi,%rcx)
+
+.Lret:
+	movq %r8,%rax			# return value = dest
+	ret
author	Rich Felker <dalias@aerifal.cx>	2013-08-01 21:44:43 -0400
committer	Rich Felker <dalias@aerifal.cx>	2013-08-01 21:44:43 -0400
commit	926272ddffa293ee68ffeb01422fc8c792acf428 (patch)
tree	8997ae7c582e5d4dbaf0056d940d15702d728258 /src/string
parent	4a1f55e92fa74ee382909baa96302231f566b5e1 (diff)
download	musl-926272ddffa293ee68ffeb01422fc8c792acf428.tar.gz musl-926272ddffa293ee68ffeb01422fc8c792acf428.tar.xz musl-926272ddffa293ee68ffeb01422fc8c792acf428.zip