about summary refs log tree commit diff
path: root/REORG.TODO/sysdeps/i386/i586/strlen.S
diff options
context:
space:
mode:
Diffstat (limited to 'REORG.TODO/sysdeps/i386/i586/strlen.S')
-rw-r--r--REORG.TODO/sysdeps/i386/i586/strlen.S182
1 files changed, 182 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/i586/strlen.S b/REORG.TODO/sysdeps/i386/i586/strlen.S
new file mode 100644
index 0000000000..cfea2e020f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strlen.S
@@ -0,0 +1,182 @@
+/* strlen -- Compute length of NUL terminated string.
+   Highly optimized version for ix86, x>=5.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism as possible we have to execute some *more*
+   instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout the whole code.  */
+#define magic 0xfefefeff
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS
+
+	.text
+ENTRY (strlen)		/* C prototype: size_t strlen (const char *str)
+
+			   In:   STR(%esp) = pointer to the string.
+			   Out:  %eax = number of bytes in front of the
+				 terminating NUL.
+			   Uses: %ecx, %edx and the flags.  The code
+				 between ENTRY and END pushes nothing,
+				 so STR(%esp) can be read again at L(2).
+
+			   Three phases: test up to three bytes one at
+			   a time until %eax is 4-byte aligned, then
+			   scan whole words in a loop which is unrolled
+			   four times, finally (at L(3)) locate the NUL
+			   inside the word that stopped the loop.  The
+			   word loop may load bytes located behind the
+			   NUL, but only from the aligned word which
+			   contains it.  */
+
+	movl STR(%esp), %eax	/* %eax = start of string */
+	movl $3, %edx		/* load mask (= 3) */
+
+	andl %eax, %edx		/* %edx = address & 3, ZF and PF are set
+				   from this value */
+
+	jz L(1)			/* aligned => start loop (%edx == 0) */
+	jp L(0)			/* even parity => both bits are set, so
+				   address & 3 == 3 and only one byte is
+				   left to test before alignment */
+
+	cmpb %dh, (%eax)	/* is byte NUL?  (%dh is 0 as %edx <= 3) */
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	cmpb %dh, (%eax)	/* is byte NUL? */
+
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	xorl $2, %edx		/* %edx was 1 or 2: 2 => 0 (aligned now),
+				   1 => 3 (one more byte to test) */
+
+	jz L(1)			/* aligned => start loop (%edx == 0) */
+
+L(0):	cmpb %dh, (%eax)	/* is byte NUL? */
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	xorl %edx, %edx		/* We need %edx == 0 for later */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 In this file MAGIC_BITS is the macro magic and LONGWORD is
+	 the word loaded into %ecx.  The holes are the zero bits of
+	 magic (~magic == 0x01010100, i.e. bits 8, 16 and 24) plus
+	 the carry flag, which stands for bit 32.  Each round computes
+	 ~word ^ (word + magic) in %edx: a hole bit of that value is
+	 set exactly when no carry arrived at that position.  The
+	 carry flag is tested by jnc (the decl in between leaves it
+	 untouched), the other three holes by the andl with ~magic.
+	 When that andl yields zero it also leaves %edx == 0, which
+	 is the value the next round starts from.
+
+	 Note: %edx == 0 in any case here.  */
+
+L(1):				/* round 1 of 4, entered with %edx == 0 */
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word: 0 - word */
+	addl $magic, %ecx	/* add magic word (carry flag = carry out
+				   of bit 31) */
+
+	decl %edx		/* complete negation: %edx = ~word, the
+				   carry flag is not changed */
+	jnc L(3)		/* no carry out of bit 31 => the word
+				   contains a NUL byte */
+
+	xorl %ecx, %edx		/* ~word ^ (word + magic) */
+
+	andl $~magic, %edx	/* any hole bit which got no carry? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* round 2 of 4: get next word */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word: 0 - word */
+	addl $magic, %ecx	/* add magic word (carry flag = carry out
+				   of bit 31) */
+
+	decl %edx		/* complete negation: %edx = ~word, the
+				   carry flag is not changed */
+	jnc L(3)		/* no carry out of bit 31 => the word
+				   contains a NUL byte */
+
+	xorl %ecx, %edx		/* ~word ^ (word + magic) */
+
+	andl $~magic, %edx	/* any hole bit which got no carry? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* round 3 of 4: get next word */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word: 0 - word */
+	addl $magic, %ecx	/* add magic word (carry flag = carry out
+				   of bit 31) */
+
+	decl %edx		/* complete negation: %edx = ~word, the
+				   carry flag is not changed */
+	jnc L(3)		/* no carry out of bit 31 => the word
+				   contains a NUL byte */
+
+	xorl %ecx, %edx		/* ~word ^ (word + magic) */
+
+	andl $~magic, %edx	/* any hole bit which got no carry? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* round 4 of 4: get next word */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word: 0 - word */
+	addl $magic, %ecx	/* add magic word (carry flag = carry out
+				   of bit 31) */
+
+	decl %edx		/* complete negation: %edx = ~word, the
+				   carry flag is not changed */
+	jnc L(3)		/* no carry out of bit 31 => the word
+				   contains a NUL byte */
+
+	xorl %ecx, %edx		/* ~word ^ (word + magic) */
+
+	andl $~magic, %edx	/* any hole bit which got no carry? */
+
+	je L(1)			/* no => start loop again (%edx == 0) */
+
+
+L(3):	subl $4, %eax		/* correct too early pointer increment */
+	subl $magic, %ecx	/* undo the addl: %ecx = original word
+				   (no path to L(3) modified %ecx after
+				   the addl) */
+
+	cmpb $0, %cl		/* lowest byte NUL? */
+	jz L(2)			/* yes => return */
+
+	inc %eax		/* increment pointer */
+	testb %ch, %ch		/* second byte NUL? */
+
+	jz L(2)			/* yes => return */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb $0, %cl		/* is third byte NUL? */
+	jz L(2)			/* yes => return */
+
+	incl %eax		/* no => the NUL is the fourth byte */
+
+L(2):	subl STR(%esp), %eax	/* now compute the length as difference
+				   between start and terminating NUL
+				   character */
+	ret
+END (strlen)
+libc_hidden_builtin_def (strlen)