about summary refs log tree commit diff
path: root/sysdeps/alpha
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/alpha')
-rw-r--r--sysdeps/alpha/Makefile2
-rw-r--r--sysdeps/alpha/bsd-_setjmp.S4
-rw-r--r--sysdeps/alpha/bsd-setjmp.S4
-rw-r--r--sysdeps/alpha/bzero.S7
-rw-r--r--sysdeps/alpha/div.S110
-rw-r--r--sysdeps/alpha/divl.S4
-rw-r--r--sysdeps/alpha/divlu.S6
-rw-r--r--sysdeps/alpha/divq.S4
-rw-r--r--sysdeps/alpha/divqu.S6
-rw-r--r--sysdeps/alpha/divrem.h308
-rw-r--r--sysdeps/alpha/ffs.S7
-rw-r--r--sysdeps/alpha/htonl.S8
-rw-r--r--sysdeps/alpha/htons.S8
-rw-r--r--sysdeps/alpha/ldiv.S109
-rw-r--r--sysdeps/alpha/lldiv.S1
-rw-r--r--sysdeps/alpha/memchr.S7
-rw-r--r--sysdeps/alpha/memcpy.S276
-rw-r--r--sysdeps/alpha/memset.S7
-rw-r--r--sysdeps/alpha/reml.S4
-rw-r--r--sysdeps/alpha/remlu.S6
-rw-r--r--sysdeps/alpha/remq.S4
-rw-r--r--sysdeps/alpha/remqu.S6
-rw-r--r--sysdeps/alpha/s_copysign.S8
-rw-r--r--sysdeps/alpha/s_fabs.S8
-rw-r--r--sysdeps/alpha/setjmp.S4
-rw-r--r--sysdeps/alpha/stpcpy.S4
-rw-r--r--sysdeps/alpha/stpncpy.S35
-rw-r--r--sysdeps/alpha/strcat.S33
-rw-r--r--sysdeps/alpha/strchr.S36
-rw-r--r--sysdeps/alpha/strcmp.S195
-rw-r--r--sysdeps/alpha/strcpy.S4
-rw-r--r--sysdeps/alpha/strlen.S9
-rw-r--r--sysdeps/alpha/strncat.S4
-rw-r--r--sysdeps/alpha/strncmp.S224
-rw-r--r--sysdeps/alpha/strncpy.S4
-rw-r--r--sysdeps/alpha/strrchr.S7
-rw-r--r--sysdeps/alpha/udiv_qrnnd.S44
-rw-r--r--sysdeps/alpha/w_sqrt.S161
38 files changed, 1447 insertions, 231 deletions
diff --git a/sysdeps/alpha/Makefile b/sysdeps/alpha/Makefile
index 45babb6c1c..6d4fbbb18b 100644
--- a/sysdeps/alpha/Makefile
+++ b/sysdeps/alpha/Makefile
@@ -42,4 +42,4 @@ ifeq ($(subdir),elf)
 sysdep-CFLAGS += -mno-fp-regs
 endif
 
-divrem := divl divlu divq divqu reml remlu remq remqu
+divrem := divl divq reml remq
diff --git a/sysdeps/alpha/bsd-_setjmp.S b/sysdeps/alpha/bsd-_setjmp.S
index a7bdbb5a61..be7f6dd05f 100644
--- a/sysdeps/alpha/bsd-_setjmp.S
+++ b/sysdeps/alpha/bsd-_setjmp.S
@@ -25,6 +25,10 @@ Cambridge, MA 02139, USA.  */
 
 ENTRY(_setjmp)
 	ldgp	$29,0($27)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
 	bis	$31, $31, $17		/* Pass a second argument of zero.  */
 	jmp	$31, __sigsetjmp	/* Call __sigsetjmp.  */
diff --git a/sysdeps/alpha/bsd-setjmp.S b/sysdeps/alpha/bsd-setjmp.S
index c0ed691f40..2b799613d7 100644
--- a/sysdeps/alpha/bsd-setjmp.S
+++ b/sysdeps/alpha/bsd-setjmp.S
@@ -25,6 +25,10 @@ Cambridge, MA 02139, USA.  */
 
 ENTRY(setjmp)
 	ldgp	$29, 0($27)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
 	bis	$31, 1, $17		/* Pass a second argument of one.  */
 	jmp	$31, __sigsetjmp	/* Call __sigsetjmp.  */
diff --git a/sysdeps/alpha/bzero.S b/sysdeps/alpha/bzero.S
index fffa53d7f1..c614fc1a6e 100644
--- a/sysdeps/alpha/bzero.S
+++ b/sysdeps/alpha/bzero.S
@@ -80,7 +80,14 @@ $tail:	bne	t4, 1f		# is there a tail to do?
 	.end bzero_loop
 
 ENTRY(bzero)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
 
 	mov	a0, v0		# e0    : move return value in place
 	beq	a1, $done	# .. e1 : early exit for zero-length store
diff --git a/sysdeps/alpha/div.S b/sysdeps/alpha/div.S
new file mode 100644
index 0000000000..6c461c40d4
--- /dev/null
+++ b/sysdeps/alpha/div.S
@@ -0,0 +1,110 @@
+/* Copyright (C) 1996 Free Software Foundation, Inc.
+   Contributed by Richard Henderson (rth@tamu.edu)
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA.  */
+
+
+#include <sysdep.h>
+
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/pal.h>
+#else
+# include <machine/pal.h>
+#endif
+
+	.set noat
+
+	.align 4
+	.globl div
+	.ent div
+
+	/* div -- signed 32-bit divide giving both quotient and remainder.
+
+	   In:	a0 = address of the result structure
+		a1 = dividend, a2 = divisor (sign-extended from 32 bits below)
+	   Out:	0(a0) = quotient, 4(a0) = remainder, v0 = a0
+	   Scratch: t1-t7
+
+	   A zero divisor raises GEN_INTDIV through PAL_gentrap; should the
+	   trap return, both result fields are stored as zero.  */
+div:
+	.frame sp, 0, ra
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
+	.prologue 0
+#endif
+
+#define divisor   t1
+#define mask      t2
+#define quotient  t3
+#define modulus   t4
+#define tmp1      t5
+#define tmp2      t6
+#define compare   t7
+
+	/* Find the magnitudes to feed the unsigned divide loop.  The loop
+	   keeps its running remainder in modulus, accumulates into quotient
+	   and walks a single quotient bit in mask, so modulus must start as
+	   |dividend|, quotient as zero and mask as one.  */
+	sextl	a1, a1				# e0    :
+	sextl	a2, a2				# .. e1 :
+	negl	a1, modulus			# e0    :
+	negl	a2, divisor			# .. e1 :
+	clr	quotient			# e0    :
+	ldiq	mask, 1				# .. e1 :
+	cmovge	a1, a1, modulus			# e0    : modulus = |a1|
+	cmovge	a2, a2, divisor			# .. e1 : divisor = |a2|
+	beq	a2, $divbyzero			# e0    :
+	unop					#       :
+
+	/* shift divisor left, using 3-bit shifts for 32-bit divides as we
+	   can't overflow.  Three-bit shifts will result in looping three
+	   times less here, but can result in two loops more later.  Thus
+	   using a large shift isn't worth it (and s8addq pairs better than
+	   a shift).  mask is shifted in step so that it always marks the
+	   quotient bit matching the current divisor position.  */
+
+1:	cmpult	divisor, modulus, compare	# e0    :
+	s8addq	divisor, zero, divisor		# .. e1 :
+	s8addq	mask, zero, mask		# e0    :
+	bne	compare, 1b			# .. e1 :
+
+	/* start to go right again: at each position subtract the divisor
+	   from the remainder when it fits and set that quotient bit.  */
+2:	addq	quotient, mask, tmp2		# e1    :
+	srl	mask, 1, mask			# .. e0 :
+	cmpule	divisor, modulus, compare	# e0    :
+	subq	modulus, divisor, tmp1		# .. e1 :
+	cmovne	compare, tmp2, quotient		# e1    :
+	srl	divisor, 1, divisor		# .. e0 :
+	cmovne	compare, tmp1, modulus		# e0    :
+	bne	mask, 2b			# .. e1 :
+
+	/* find correct sign for result: the quotient is negated when the
+	   operand signs differ, the remainder when the dividend is negative.  */
+	xor	a1, a2, compare			# e0    :
+	negl	quotient, tmp1			# .. e1 :
+	negl	modulus, tmp2			# e0    :
+	cmovlt	compare, tmp1, quotient		# .. e1 :
+	cmovlt	a1, tmp2, modulus		# e1    :
+
+	/* and store it away in the structure.  */
+	stl	quotient, 0(a0)			# .. e0 :
+	mov	a0, v0				# e1    :
+	stl	modulus, 4(a0)			# .. e0 :
+	ret					# e1    :
+
+$divbyzero:
+	mov	a0, v0				# a0 is needed for the trap code
+	ldiq	a0, GEN_INTDIV
+	call_pal PAL_gentrap
+
+	/* if trap returns, return zero.  */
+	stl	zero, 0(v0)
+	stl	zero, 4(v0)
+	ret
+
+	.end div
diff --git a/sysdeps/alpha/divl.S b/sysdeps/alpha/divl.S
index 6990665937..fdf053fc25 100644
--- a/sysdeps/alpha/divl.S
+++ b/sysdeps/alpha/divl.S
@@ -1,6 +1,6 @@
 #define IS_REM		0
 #define SIZE		4
-#define SIGNED		1
-#define FUNC_NAME	__divl
+#define UFUNC_NAME	__divlu
+#define SFUNC_NAME	__divl
 
 #include "divrem.h"
diff --git a/sysdeps/alpha/divlu.S b/sysdeps/alpha/divlu.S
deleted file mode 100644
index ee96c95008..0000000000
--- a/sysdeps/alpha/divlu.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define IS_REM		0
-#define SIZE		4
-#define SIGNED		0
-#define FUNC_NAME	__divlu
-
-#include "divrem.h"
diff --git a/sysdeps/alpha/divq.S b/sysdeps/alpha/divq.S
index bde3425f37..8c88af9736 100644
--- a/sysdeps/alpha/divq.S
+++ b/sysdeps/alpha/divq.S
@@ -1,6 +1,6 @@
 #define IS_REM		0
 #define SIZE		8
-#define SIGNED		1
-#define FUNC_NAME	__divq
+#define UFUNC_NAME	__divqu
+#define SFUNC_NAME	__divq
 
 #include "divrem.h"
diff --git a/sysdeps/alpha/divqu.S b/sysdeps/alpha/divqu.S
deleted file mode 100644
index 72dcf971dd..0000000000
--- a/sysdeps/alpha/divqu.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define IS_REM		0
-#define SIZE		8
-#define SIGNED		0
-#define FUNC_NAME	__divqu
-
-#include "divrem.h"
diff --git a/sysdeps/alpha/divrem.h b/sysdeps/alpha/divrem.h
index eaf892b3c6..b83908dc9f 100644
--- a/sysdeps/alpha/divrem.h
+++ b/sysdeps/alpha/divrem.h
@@ -1,25 +1,25 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
    Contributed by David Mosberger (davidm@cs.arizona.edu).
 
-This file is part of the GNU C Library.
+   This file is part of the GNU C Library.
 
-The GNU C Library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public License as
-published by the Free Software Foundation; either version 2 of the
-License, or (at your option) any later version.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
 
-The GNU C Library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
 
-You should have received a copy of the GNU Library General Public
-License along with the GNU C Library; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.  */
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA.  */
 
 /* The current Alpha chips don't provide hardware for integer
-division.  The C compiler expects the functions
+   division.  The C compiler expects the functions
 
 	__divqu: 64-bit unsigned long divide
 	__remqu: 64-bit unsigned long remainder
@@ -27,10 +27,10 @@ division.  The C compiler expects the functions
 	__divlu/__remlu: unsigned 32-bit
 	__divls/__remls: signed 32-bit
 
-These are not normal C functions: instead of the normal calling
-sequence, these expect their arguments in registers t10 and t11, and
-return the result in t12 (aka pv). Register AT may be clobbered
-(assembly temporary), anything else must be saved.  */
+   These are not normal C functions: instead of the normal calling
+   sequence, these expect their arguments in registers t10 and t11, and
+   return the result in t12 (aka pv).  Register AT may be clobbered
+   (assembly temporary), anything else must be saved.  */
 
 #include <sysdep.h>
 
@@ -41,147 +41,185 @@ return the result in t12 (aka pv). Register AT may be clobbered
 # include <machine/pal.h>
 #endif
 
-#ifdef DEBUG
-# define arg1		a0
-# define arg2		a1
-# define result		v0
-# define mask		t0
-# define tmp0		t1
-# define tmp1		t2
-# define sign		t3
-# define retaddr	ra
-#else
-# define arg1		t10
-# define arg2		t11
-# define result		t12
-# define mask		v0
-# define tmp0		t0
-# define tmp1		t1
-# define sign		t2
-# define retaddr	t9
-#endif
+#define mask			v0	/* quotient bit currently being tried */
+#define divisor			t0	/* working copy of arg2, shifted by the loops */
+#define compare			AT	/* result of the unsigned comparisons */
+#define tmp1			t2
+#define tmp2			t3
+#define retaddr			t9	/* return address for these routines */
+#define arg1			t10	/* first operand */
+#define arg2			t11	/* second operand */
+#define result			t12	/* value handed back to the caller */
 
-# define divisor	arg2
 #if IS_REM
-# define dividend	result
-# define quotient	arg1
-# define GETDIVIDEND	bis arg1,zero,dividend
+# define DIV_ONLY(x,y...)
+# define REM_ONLY(x,y...)	x,##y
+# define modulus		result
+# define quotient		t1
+# define GETSIGN(x)		mov arg1, x
+# define STACK			32
 #else
-# define dividend	arg1
-# define quotient	result
-# define GETDIVIDEND
+# define DIV_ONLY(x,y...)	x,##y
+# define REM_ONLY(x,y...)
+# define modulus		t1
+# define quotient		result
+# define GETSIGN(x)		xor arg1, arg2, x
+# define STACK			48
 #endif
 
 #if SIZE == 8
-# define LONGIFYarg1	GETDIVIDEND
-# define LONGIFYarg2
-#else
-# if SIGNED
-#  define LONGIFYarg1	addl	arg1,zero,dividend
-#  define LONGIFYarg2	addl	arg2,zero,divisor
-# else
-#  define LONGIFYarg1	zapnot	arg1,0x0f,dividend
-#  define LONGIFYarg2	zapnot	arg2,0x0f,divisor
-# endif
-#endif
-
-#if SIGNED
-# define SETSIGN(sign,reg,tmp)	subq zero,reg,tmp; cmovlt sign,tmp,reg
-# if IS_REM
-#  define GETSIGN(x,y,s)	bis	x,zero,s
-# else
-#  define GETSIGN(x,y,s)	xor	x,y,s
-# endif
+# define LONGIFY(x,y)		mov x,y
+# define SLONGIFY(x,y)		mov x,y
+# define _SLONGIFY(x)
+# define NEG(x,y)		negq x,y
 #else
-# define SETSIGN(sign,reg,tmp)
-# define GETSIGN(x,y,s)
+# define LONGIFY(x,y)		zapnot x,15,y
+# define SLONGIFY(x,y)		sextl x,y
+# define _SLONGIFY(x)		sextl x,x
+# define NEG(x,y)		negl x,y
 #endif
 
 	.set noreorder
 	.set noat
 
-	.ent FUNC_NAME
-	.globl FUNC_NAME
-
-#define FRAME_SIZE	0x30
+	.ent UFUNC_NAME
+	.globl UFUNC_NAME
 
-	.align 5
-FUNC_NAME:
+	.align 3
+UFUNC_NAME:
+	lda	sp, -STACK(sp)		/* room for the register saves below */
+	.frame	sp, STACK, retaddr, 0
 #ifdef PROF
-	lda	sp, -0x18(sp)
-	stq	ra, 0x00(sp)
-	stq	pv, 0x08(sp)
-	stq	gp, 0x10(sp)
+	stq	ra, 0(sp)
+	stq	pv, 8(sp)
+	stq	gp, 16(sp)
 
 	br	AT, 1f
 1:	ldgp	gp, 0(AT)
 
 	mov	retaddr, ra
-	jsr	AT, _mcount
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
 
-	ldq	ra, 0x00(sp)
-	ldq	pv, 0x08(sp)
-	ldq	gp, 0x10(sp)
-	lda	sp, 0x18(sp)
+	ldq	ra, 0(sp)
+	ldq	pv, 8(sp)
+	ldq	gp, 16(sp)
 #endif
-	.frame	sp, FRAME_SIZE, retaddr, 0
-	lda	sp,-FRAME_SIZE(sp)
-	.prologue 1
-	stq	arg1,0x00(sp)
-	LONGIFYarg1
-	stq	arg2,0x08(sp)
-	LONGIFYarg2
-	stq	mask,0x10(sp)
-	bis	zero,1,mask
-	stq	tmp0,0x18(sp)
-	bis	zero,zero,quotient
-	stq	tmp1,0x20(sp)
-	beq	divisor,$divbyzero
-	stq	sign,0x28(sp)
-	GETSIGN(dividend,divisor,sign)
-#if SIGNED
-	subq	zero,dividend,tmp0
-	subq	zero,divisor,tmp1
-	cmovlt	dividend,tmp0,dividend
-	cmovlt	divisor,tmp1,divisor
+	.prologue 0
+
+$udiv:					/* SFUNC_NAME also enters here when no sign fixup is needed */
+	stq	t0, 0(sp)
+	LONGIFY	(arg2, divisor)		/* divisor = arg2 taken as an unsigned value */
+	stq	t1, 8(sp)
+	LONGIFY	(arg1, modulus)		/* the running remainder starts as arg1 */
+	stq	v0, 16(sp)
+	clr	quotient
+	stq	tmp1, 24(sp)
+	ldiq	mask, 1			/* mask follows the divisor's shift position */
+	DIV_ONLY(stq tmp2,32(sp))	/* tmp2 is only used when a quotient is built */
+
+	beq	divisor, $divbyzero
+
+	.align 3
+#if SIZE == 8
+	/* Shift divisor left.  */
+1:	cmpult	divisor, modulus, compare
+	blt	divisor, 2f		/* top bit set: cannot shift any further */
+	addq	divisor, divisor, divisor
+	addq	mask, mask, mask
+	bne	compare, 1b
+	unop
+2:
+#else
+	/* Shift divisor left using 3-bit shifts as we can't overflow.
+	   This results in looping three times less here, but up to
+	   two more times later.  Thus using a large shift isn't worth it.  */
+1:	cmpult	divisor, modulus, compare
+	s8addq	divisor, zero, divisor
+	s8addq	mask, zero, mask
+	bne	compare, 1b
 #endif
-	/*
-	 * Shift divisor left until either bit 63 is set or until it
-	 * is at least as big as the dividend:
-	 */
-	.align	3
-1:	cmpule	dividend,divisor,AT
-	blt	divisor,2f
-	blbs	AT,2f
-	addq	mask,mask,mask
-	addq	divisor,divisor,divisor
-	br	1b
-
-	.align	3
-2:	addq	mask,quotient,tmp0
-	cmpule	divisor,dividend,AT
-	subq	dividend,divisor,tmp1
-	srl	divisor,1,divisor
-	srl	mask,1,mask
-	cmovlbs	AT,tmp0,quotient
-	cmovlbs	AT,tmp1,dividend
-	bne	mask,2b
-
-	ldq	arg1,0x00(sp)
-	SETSIGN(sign,result,tmp0)
-$done:	ldq	arg2,0x08(sp)
-	ldq	mask,0x10(sp)
-	ldq	tmp0,0x18(sp)
-	ldq	tmp1,0x20(sp)
-	ldq	sign,0x28(sp)
-	lda	sp,FRAME_SIZE(sp)
-	ret	zero,(retaddr),0
+
+	/* Now go back to the right.  */
+3:	DIV_ONLY(addq quotient, mask, tmp2)
+	srl	mask, 1, mask
+	cmpule	divisor, modulus, compare	/* does the divisor fit in the remainder? */
+	subq	modulus, divisor, tmp1
+	DIV_ONLY(cmovne compare, tmp2, quotient)
+	srl	divisor, 1, divisor
+	cmovne	compare, tmp1, modulus		/* if so, keep the reduced remainder */
+	bne	mask, 3b
+
+$done:	ldq	t0, 0(sp)		/* restore everything saved at $udiv */
+	ldq	t1, 8(sp)
+	ldq	v0, 16(sp)
+	ldq	tmp1, 24(sp)
+	DIV_ONLY(ldq tmp2, 32(sp))
+	lda	sp, STACK(sp)
+	ret	zero, (retaddr), 1
 
 $divbyzero:
-	lda	a0,GEN_INTDIV(zero)
+	mov	a0, tmp1		/* a0 must carry the trap code, so park it */
+	ldiq	a0, GEN_INTDIV
 	call_pal PAL_gentrap
-	bis	zero,zero,result	/* if trap returns, return 0 */
-	ldq	arg1,0x00(sp)
+	mov	tmp1, a0
+	clr	result			/* If trap returns, return zero.  */
 	br	$done
 
-	END(FUNC_NAME)
+	.end UFUNC_NAME
+
+	.ent SFUNC_NAME
+	.globl SFUNC_NAME
+
+	/* Signed entry point.  Operands arrive in arg1/arg2, the value is
+	   handed back in result and control returns through retaddr.  When
+	   neither operand is negative this joins the unsigned code at $udiv;
+	   otherwise the absolute values are divided through UFUNC_NAME and
+	   the sign of the result is adjusted afterwards.  */
+	.align 3
+SFUNC_NAME:
+	lda	sp, -STACK(sp)
+	.frame	sp, STACK, retaddr, 0
+#ifdef PROF
+	stq	ra, 0(sp)
+	stq	pv, 8(sp)
+	stq	gp, 16(sp)
+
+	br	AT, 1f
+1:	ldgp	gp, 0(AT)
+
+	mov	retaddr, ra
+	lda	AT, _mcount		/* noat is in effect, so load the target by hand */
+	jsr	AT, (AT), _mcount
+
+	ldq	ra, 0(sp)
+	ldq	pv, 8(sp)
+	ldq	gp, 16(sp)
+#endif
+	.prologue 0
+
+	or	arg1, arg2, AT		/* sign bit set if either operand is negative */
+	_SLONGIFY(AT)
+	bge	AT, $udiv		/* don't need to mess with signs */
+
+	/* Save originals and find absolute values.  */
+	stq	arg1, 0(sp)
+	NEG	(arg1, AT)		/* AT = -arg1 */
+	stq	arg2, 8(sp)
+	cmovge	AT, AT, arg1		/* arg1 was negative: use its negation */
+	stq	retaddr, 16(sp)
+	NEG	(arg2, AT)		/* AT = -arg2 */
+	stq	tmp1, 24(sp)
+	cmovge	AT, AT, arg2		/* arg2 was negative: use its negation */
+
+	/* Do the unsigned division.  */
+	bsr	retaddr, UFUNC_NAME
+
+	/* Restore originals and adjust the sign of the result.  */
+	ldq	arg1, 0(sp)
+	ldq	arg2, 8(sp)
+	GETSIGN	(AT)			/* AT < 0 when the result must be negative */
+	NEG	(result, tmp1)
+	_SLONGIFY(AT)
+	ldq	retaddr, 16(sp)
+	cmovlt	AT, tmp1, result
+	ldq	tmp1, 24(sp)
+
+	lda	sp, STACK(sp)
+	ret	zero, (retaddr), 1
+
+	.end	SFUNC_NAME
diff --git a/sysdeps/alpha/ffs.S b/sysdeps/alpha/ffs.S
index b84a51d326..959d1046d4 100644
--- a/sysdeps/alpha/ffs.S
+++ b/sysdeps/alpha/ffs.S
@@ -27,7 +27,14 @@ architecture.  */
         .set noat
 
 ENTRY(ffs)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
 
 	ldq_u	zero, 0(sp)	# on the 21064, this helps dual-issuing
 	addl	a0, zero, a0	# the last insn and reduces the stall
diff --git a/sysdeps/alpha/htonl.S b/sysdeps/alpha/htonl.S
index 9777e461cc..55d4f62bc7 100644
--- a/sysdeps/alpha/htonl.S
+++ b/sysdeps/alpha/htonl.S
@@ -19,7 +19,15 @@ Cambridge, MA 02139, USA.  */
 #include <sysdep.h>
 
 ENTRY(__htonl)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
+
 	extlh	a0, 5, t1	# t1 = dd000000
 	zap	a0, 0xfd, t2	# t2 = 0000cc00
 	sll	t2, 5, t2	# t2 = 00198000
diff --git a/sysdeps/alpha/htons.S b/sysdeps/alpha/htons.S
index 7717636782..743d3e2474 100644
--- a/sysdeps/alpha/htons.S
+++ b/sysdeps/alpha/htons.S
@@ -19,7 +19,15 @@ Cambridge, MA 02139, USA.  */
 #include <sysdep.h>
 
 ENTRY(__htons)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
+
 	extwh	a0, 7, t1	# t1 = bb00
 	extbl	a0, 1, v0	# v0 = 00aa
 	bis	v0, t1, v0	# v0 = bbaa
diff --git a/sysdeps/alpha/ldiv.S b/sysdeps/alpha/ldiv.S
new file mode 100644
index 0000000000..ebbe055870
--- /dev/null
+++ b/sysdeps/alpha/ldiv.S
@@ -0,0 +1,109 @@
+/* Copyright (C) 1996 Free Software Foundation, Inc.
+   Contributed by Richard Henderson (rth@tamu.edu)
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA.  */
+
+
+#include <sysdep.h>
+
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/pal.h>
+#else
+# include <machine/pal.h>
+#endif
+
+	.set noat
+
+	.align 4
+	.globl ldiv
+	.ent ldiv
+
+	/* ldiv -- signed 64-bit divide giving both quotient and remainder.
+
+	   In:	a0 = address of the result structure
+		a1 = dividend, a2 = divisor
+	   Out:	0(a0) = quotient, 8(a0) = remainder, v0 = a0
+	   Scratch: t1-t7
+
+	   A zero divisor raises GEN_INTDIV through PAL_gentrap; should the
+	   trap return, both result fields are stored as zero.  */
+ldiv:
+	.frame sp, 0, ra
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
+	.prologue 0
+#endif
+
+#define divisor   t1
+#define mask      t2
+#define quotient  t3
+#define modulus   t4
+#define tmp1      t5
+#define tmp2      t6
+#define compare   t7
+
+	/* Find the magnitudes to feed the unsigned divide loop.  The loop
+	   keeps its running remainder in modulus, accumulates into quotient
+	   and walks a single quotient bit in mask, so modulus must start as
+	   |dividend|, quotient as zero and mask as one.  */
+	mov	a1, modulus			# e0    :
+	mov	a2, divisor			# .. e1 :
+	negq	a1, tmp1			# e0    :
+	negq	a2, tmp2			# .. e1 :
+	clr	quotient			# e0    :
+	ldiq	mask, 1				# .. e1 :
+	cmovlt	a1, tmp1, modulus		# e0    : modulus = |a1|
+	cmovlt	a2, tmp2, divisor		# .. e1 : divisor = |a2|
+	beq	a2, $divbyzero			# e0    :
+	unop					#       :
+
+	/* shift divisor left until it is no smaller than the remainder or
+	   its top bit is set; mask is shifted in step.  */
+1:	cmpult	divisor, modulus, compare	# e0    :
+	blt	divisor, 2f			# .. e1 :
+	addq	divisor, divisor, divisor	# e0    :
+	addq	mask, mask, mask		# .. e1 :
+	bne	compare, 1b			# e1    :
+	unop					#       :
+
+	/* start to go right again: at each position subtract the divisor
+	   from the remainder when it fits and set that quotient bit.  */
+2:	addq	quotient, mask, tmp2		# e1    :
+	srl	mask, 1, mask			# .. e0 :
+	cmpule	divisor, modulus, compare	# e0    :
+	subq	modulus, divisor, tmp1		# .. e1 :
+	cmovne	compare, tmp2, quotient		# e1    :
+	srl	divisor, 1, divisor		# .. e0 :
+	cmovne	compare, tmp1, modulus		# e0    :
+	bne	mask, 2b			# .. e1 :
+
+	/* find correct sign for result: the quotient is negated when the
+	   operand signs differ, the remainder when the dividend is negative.  */
+	xor	a1, a2, compare			# e0    :
+	negq	quotient, tmp1			# .. e1 :
+	negq	modulus, tmp2			# e0    :
+	cmovlt	compare, tmp1, quotient		# .. e1 :
+	cmovlt	a1, tmp2, modulus		# e1    :
+
+	/* and store it away in the structure.  */
+9:	stq	quotient, 0(a0)			# .. e0 :
+	mov	a0, v0				# e1    :
+	stq	modulus, 8(a0)			# .. e0 :
+	ret					# e1    :
+
+$divbyzero:
+	mov	a0, v0				# a0 is needed for the trap code
+	ldiq	a0, GEN_INTDIV
+	call_pal PAL_gentrap
+
+	/* if trap returns, return zero.  */
+	stq	zero, 0(v0)
+	stq	zero, 8(v0)
+	ret
+
+	.end ldiv
+
+weak_alias(ldiv, lldiv)
diff --git a/sysdeps/alpha/lldiv.S b/sysdeps/alpha/lldiv.S
new file mode 100644
index 0000000000..80c450a3fc
--- /dev/null
+++ b/sysdeps/alpha/lldiv.S
@@ -0,0 +1 @@
+/* lldiv is the same as ldiv on the Alpha.  */
diff --git a/sysdeps/alpha/memchr.S b/sysdeps/alpha/memchr.S
index a47ac96e01..ecd26e8d6f 100644
--- a/sysdeps/alpha/memchr.S
+++ b/sysdeps/alpha/memchr.S
@@ -40,7 +40,14 @@ For correctness consider that:
         .set noat
 
 ENTRY(memchr)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
 
 	beq	a2, $not_found
         ldq_u   t0, 0(a0)       # load first quadword (a0 may be misaligned)
diff --git a/sysdeps/alpha/memcpy.S b/sysdeps/alpha/memcpy.S
new file mode 100644
index 0000000000..4ee9c115db
--- /dev/null
+++ b/sysdeps/alpha/memcpy.S
@@ -0,0 +1,276 @@
+/* Copyright (C) 1996 Free Software Foundation, Inc.
+   Contributed by Richard Henderson (rth@tamu.edu)
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA.  */
+
+
+/* This is the child of the C-with-inline-assembly memcpy posted by
+   Martin Ostermann (ost@comnets.rwth-aachen.de).
+
+   This is generally scheduled for the EV5, but whenever necessary and
+   possible, the autoswap slotting feature of the EV5 is used so that the
+   code lays out nicely for the EV4 as well.  */
+
+#include <alpha/regdef.h>
+
+	.set noreorder
+
+	.text
+
+	.ent copy_fwd_aligned
+copy_fwd_aligned:
+	.frame sp, 0, ra, 0
+	.prologue 0
+
+	/* Aligned forward copy main loop.  On entry to this basic block:
+	   t0 == source word waiting to be stored
+	   t2 == loop counter
+	   a0 == destination pointer
+	   a1 == source pointer
+	   a2 mod 8 == byte count in final word */
+	.align 4
+$fa_loop:
+	and	t2, 7, t1	# e0    : t1 = word count mod 8
+	beq	t1, 1f		# .. e1 : nothing left over, go to the unrolled loop
+
+0:	stq_u	t0, 0(a0)	# e0    :
+	subq	t1, 1, t1	# .. e1 :
+	ldq_u	t0, 8(a1)	# e0    : copy up to seven words
+	addq	a0, 8, a0	# .. e1 :
+	addq	a1, 8, a1	# e0    :
+	bne	t1, 0b		# .. e1 :
+
+1:	bic	t2, 7, t2	# e0    : t2 = remaining count, a multiple of eight
+	beq	t2, $fa_tail	# .. e1 :
+
+2:	stq_u	t0, 0(a0)	# e0    :
+	addq	a0, 64, a0	# .. e1 :
+	ldq_u	t3, 8(a1)	# e0    : copy eight words as fast as we can
+	ldq_u	t4, 16(a1)	# .. e1 :
+	ldq_u	t5, 24(a1)	# e0    :
+	ldq_u	t6, 32(a1)	# .. e1 :
+	ldq_u	t7, 40(a1)	# e0    :
+	ldq_u	t8, 48(a1)	# .. e1 :
+	ldq_u	t9, 56(a1)	# e0    :
+	ldq_u	t0, 64(a1)	# .. e1 : next word left waiting in t0
+	stq_u	t3, -56(a0)	# e0    :
+	subq	t2, 8, t2	# .. e1 : eight more words accounted for
+	stq_u	t4, -48(a0)	# e0    :
+	addq	a1, 64, a1	# .. e1 :
+	stq_u	t5, -40(a0)	# e0    :
+	stq_u	t6, -32(a0)	# e0    :
+	stq_u	t7, -24(a0)	# e0    :
+	stq_u	t8, -16(a0)	# e0    :
+	stq_u	t9, -8(a0)	# e0    :
+	bne	t2, 2b		# .. e1 :
+
+	/* Take care of a partial word tail.  */
+$fa_tail:
+	and	a2, 7, t3	# e0    : t3 = bytes used in the final word
+	bne	t3, 1f		# .. e1 (zdb)
+
+	/* Aligned copy, aligned tail, final store.  */
+	stq_u	t0, 0(a0)
+	ret
+
+1:	ldq_u	t1, 0(a0)	# e1    : fetch the destination word to merge with
+	mskql	t0, a2, t0	# .. e1 : keep the source bytes below the end
+	mskqh	t1, a2, t1	# e0 (stall) keep the destination bytes from the end up
+	bis	t0, t1, t0	# e1    :
+	stq_u	t0, 0(a0)	# e0    :
+	ret			# .. e1 :
+
+	/* This is the actual entry point to this function.  */
+	.align 3
+$fwd_aligned:
+	ldq_u	t0, 0(a1)	# e0    : first source word
+	and	a0, 7, t3	# .. e1 : t3 = destination offset within its word
+	addq	a2, t3, a2	# e0    : a2 = offset + byte count
+	subq	a2, 1, t2	# e1    :
+	sra	t2, 3, t2	# e0    : t2 = words to store before the final one
+	beq	t3, $fa_loop	# .. e1 : word-aligned start, no head to merge
+
+	ldq_u	t1, 0(a0)	# e0    : destination word holding the head
+	beq	t2, $fa_small	# .. e1 : the copy ends inside this same word
+	mskqh	t0, a0, t0	# e0    : keep source bytes from the offset up
+	mskql	t1, a0, t3	# e0    : keep destination bytes below the offset
+	bis	t0, t3, t0	# e0    :
+	br	$fa_loop	# .. e1 :
+
+	/* The move affects exactly one destination word.  */
+$fa_small:
+	mskqh	t0, a0, t0	# e0    : keep source bytes from the offset up
+	and	a2, 7, t4	# .. e1 : t4 = end position within the word
+	mskql	t1, a0, t3	# e0    : keep destination bytes below the offset
+	bne	t4, 1f		# .. e1 : copy stops short of the word's end
+
+	or	t0, t3, t0	# e0    :
+	unop			#       :
+	stq_u	t0, 0(a0)	# e0    :
+	ret			# .. e1 :
+
+1:	mskql	t0, a2, t0	# e0    : drop source bytes past the end
+	mskqh	t1, a2, t1	# e0    : keep destination bytes past the end
+	or	t0, t3, t0	# e0    :
+	or	t0, t1, t0	# e1    :
+	stq_u	t0, 0(a0)	# e0    :
+	ret			# .. e1 :
+
+	.end copy_fwd_aligned
+
+	.ent memcpy
+	.globl memcpy
+	.align 3
+
+	/* memcpy: a0 = destination, a1 = source, a2 = byte count.
+	   The destination pointer is returned in v0.  A zero count returns
+	   at once; otherwise the copy is handed to $fwd_aligned when source
+	   and destination share the same offset within a quadword, and to
+	   $fwd_unaligned when they do not.  */
+memcpy:
+	.frame sp, 0, ra, 0
+#ifdef PROF
+	ldgp	gp, 0(pv)	# at entry pv, not ra, holds this procedure's address
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
+	.prologue 0
+#endif
+
+	mov	a0, v0		# return value is the destination pointer
+	beq	a2, $zero_length
+
+	/* Are source and destination co-aligned?  */
+	xor	a0, a1, t0
+	unop
+	and	t0, 7, t0	# differing low three bits mean differing offsets
+	beq	t0, $fwd_aligned
+	br	$fwd_unaligned
+
+	.end memcpy
+
+	.ent copy_fwd_unaligned
+copy_fwd_unaligned:
+	.frame sp, 0, ra, 0
+	.prologue 0
+
+	/* Unaligned forward copy main loop.  On entry to this basic block:
+	   t0 == source low word, unshifted
+	   t2 == loop counter
+	   t7 == last source byte + 1
+	   a0 == destination pointer
+	   a1 == source pointer
+	   a2 mod 8 == byte count in final word */
+	.align 4
+$fu_loop:
+	beq	t2, $fu_tail	# e1    :
+	blbc	t2, 0f		# e1    : even count, go straight to the paired loop
+
+	ldq_u	t1, 8(a1)	# e1    : copy one unaligned word
+	extql	t0, a1, t3	# .. e0 :
+	addq	a1, 8, a1	# e0    :
+	addq	a0, 8, a0	# .. e1 :
+	extqh	t1, a1, t4	# e0    :
+	subq	t2, 1, t2	# .. e1 :
+	mov	t1, t0		# e0    :
+	or	t3, t4, t3	# .. e1 :
+	stq_u	t3, -8(a0)	# e0    :
+	beq	t2, $fu_tail	# .. e1 :
+
+0:	ldq_u	t1, 8(a1)	# e1    : copy two unaligned words
+	extql	t0, a1, t3	# .. e0 :
+	ldq_u	t0, 16(a1)	# e0    :
+	subq	t2, 2, t2	# .. e1 :
+	extqh	t1, a1, t4	# e0    :
+	addq	a0, 16, a0	# .. e1 :
+	extql	t1, a1, t5	# e0    :
+	or	t3, t4, t3	# .. e1 :
+	extqh	t0, a1, t6	# e0    :
+	addq	a1, 16, a1	# .. e1 :
+	stq_u	t3, -16(a0)	# e0    :
+	or	t5, t6, t5	# .. e1 :
+	stq_u	t5, -8(a0)	# e0    :
+	bne	t2, 0b		# .. e1 :
+
+	/* Take care of a partial words tail.  */
+$fu_tail:
+	ldq_u	t4, -1(t7)	# e1    : word holding the last source byte
+	extql	t0, a1, t3	# .. e0 :
+	extqh	t4, a1, t4	# e0 (stall)
+	and	a2, 7, t5	# .. e1 : t5 = bytes used in the final word
+	or	t3, t4, t3	# e0    :
+	beq	t5, 1f		# .. e1 : whole word, store it as is
+
+	ldq_u	t1, 0(a0)	# e1    :
+	mskql	t3, a2, t3	# .. e0 : keep source bytes below the end
+	mskqh	t1, a2, t1	# e0 (stall) keep destination bytes from the end up
+	or	t1, t3, t3	# e1    :
+
+1:	stq_u	t3, 0(a0)	# e0    :
+	ret			# .. e1 :
+
+	/* The entry point to the unaligned forward copy.  */
+	.align 3
+$fwd_unaligned:
+	ldq_u	t0, 0(a1)	# e0    : load initial bits of src
+	addq	a1, a2, t7	# .. e1 : record last byte + 1 of src
+	and	a0, 7, t3	# e0    : find dst misalignment
+	addq	a2, t3, a2	# e1    : find number of words affected
+	subq	a2, 1, t2	# e0    :
+	cmple	a2, 8, t4	# .. e1 : are we dealing with a small block?
+	subq	a1, t3, a1	# e0    :
+	bne	t4, $fu_small	# .. e1 :
+	srl	t2, 3, t2	# e0    :
+	beq	t3, $fu_loop	# .. e1 :
+
+	/* Take care of an unaligned dst head.  */
+	ldq_u	t5, 0(a0)	# e0    :
+	ldq_u	t1, 8(a1)	# .. e1 :
+	extql	t0, a1, t3	# e0    :
+	addq	a0, 8, a0	# .. e1 :
+	extqh	t1, a1, t4	# e0    :
+	addq	a1, 8, a1	# .. e1 :
+	mskql	t5, a0, t5	# e0    :
+	or	t3, t4, t3	# .. e1 :
+	mskqh	t3, a0, t3	# e0    :
+	subq	t2, 1, t2	# .. e1 :
+	or	t3, t5, t3	# e0    :
+	mov	t1, t0		# .. e1 :
+	stq_u	t3, -8(a0)	# e0    :
+	br	$fu_loop	# .. e1 :
+
+	/* The move affects exactly one destination word.  t5 collects the
+	   destination bytes that must survive (below the start and, when the
+	   copy stops short of the word's end, from the end up); t3 collects
+	   the source bytes, trimmed to the same window, before both are
+	   merged and stored.  */
+	.align 3
+$fu_small:
+	ldq_u	t2, 0(a0)	# e1    : destination word
+	extql	t0, a1, t3	# .. e0 :
+	ldq_u	t1, -1(t7)	# e0    : word holding the last source byte
+	and	a2, 7, t8	# .. e1 : t8 = end position, zero if the word is filled
+	mskqh	t2, a2, t6	# e0    : destination bytes from the end up
+	mskql	t2, a0, t5	# e0    : destination bytes below the start
+	extqh	t1, a1, t4	# e0    :
+	cmovne	t8, t6, t8	# .. e1 : t8 = bytes to keep past the end, if any
+	or	t3, t4, t3	# e0    :
+	or	t5, t8, t5	# .. e1 :
+	mskqh	t3, a0, t3	# e0    : drop source bytes below the start
+	and	a2, 7, t8	# .. e1 :
+	mskql	t3, a2, t6	# e0    : source with the bytes past the end dropped
+	cmovne	t8, t6, t3	# e1    : use it unless the copy fills the word
+	or	t3, t5, t3	# e0    :
+	unop			#       :
+	stq_u	t3, 0(a0)	# e0    :
+
+$zero_length:
+	ret			# .. e1 :
+
+	.end copy_fwd_unaligned
diff --git a/sysdeps/alpha/memset.S b/sysdeps/alpha/memset.S
index 55271f00ea..2b29357c6e 100644
--- a/sysdeps/alpha/memset.S
+++ b/sysdeps/alpha/memset.S
@@ -85,7 +85,14 @@ $tail:	bne	t4, 1f		# is there a tail to do?
 	.end memset_loop
 
 ENTRY(memset)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
 
 	zapnot	a1, 1, a1	# e0    : zero extend input character
 	mov	a0, v0		# .. e1 : move return value in place
diff --git a/sysdeps/alpha/reml.S b/sysdeps/alpha/reml.S
index b631a02c31..8c00365ee3 100644
--- a/sysdeps/alpha/reml.S
+++ b/sysdeps/alpha/reml.S
@@ -1,6 +1,6 @@
 #define IS_REM		1
 #define SIZE		4
-#define SIGNED		1
-#define FUNC_NAME	__reml
+#define UFUNC_NAME	__remlu
+#define SFUNC_NAME	__reml
 
 #include "divrem.h"
diff --git a/sysdeps/alpha/remlu.S b/sysdeps/alpha/remlu.S
deleted file mode 100644
index 8d527e4678..0000000000
--- a/sysdeps/alpha/remlu.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define IS_REM		1
-#define SIZE		4
-#define SIGNED		0
-#define FUNC_NAME	__remlu
-
-#include "divrem.h"
diff --git a/sysdeps/alpha/remq.S b/sysdeps/alpha/remq.S
index 8bd9f334f4..cd1064af4e 100644
--- a/sysdeps/alpha/remq.S
+++ b/sysdeps/alpha/remq.S
@@ -1,6 +1,6 @@
 #define IS_REM		1
 #define SIZE		8
-#define SIGNED		1
-#define FUNC_NAME	__remq
+#define UFUNC_NAME	__remqu
+#define SFUNC_NAME	__remq
 
 #include "divrem.h"
diff --git a/sysdeps/alpha/remqu.S b/sysdeps/alpha/remqu.S
deleted file mode 100644
index 14a7486148..0000000000
--- a/sysdeps/alpha/remqu.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define IS_REM		1
-#define SIZE		8
-#define SIGNED		0
-#define FUNC_NAME	__remqu
-
-#include "divrem.h"
diff --git a/sysdeps/alpha/s_copysign.S b/sysdeps/alpha/s_copysign.S
index 95eb608666..739d3deb79 100644
--- a/sysdeps/alpha/s_copysign.S
+++ b/sysdeps/alpha/s_copysign.S
@@ -20,7 +20,15 @@ Cambridge, MA 02139, USA.  */
 #include <sysdep.h>
 
 ENTRY(__copysign)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
+
 	cpys	$f17,$f16,$f0
 	ret
 
diff --git a/sysdeps/alpha/s_fabs.S b/sysdeps/alpha/s_fabs.S
index 12c0abdf75..75976333f9 100644
--- a/sysdeps/alpha/s_fabs.S
+++ b/sysdeps/alpha/s_fabs.S
@@ -20,7 +20,15 @@ Cambridge, MA 02139, USA.  */
 #include <sysdep.h>
 
 ENTRY(__fabs)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
+
 	cpys	$f31,$f16,$f0
 	ret
 
diff --git a/sysdeps/alpha/setjmp.S b/sysdeps/alpha/setjmp.S
index 59929a0234..f57d49017e 100644
--- a/sysdeps/alpha/setjmp.S
+++ b/sysdeps/alpha/setjmp.S
@@ -23,6 +23,10 @@ Cambridge, MA 02139, USA.  */
    extra arguments.  */
 ENTRY (__sigsetjmp)
 	ldgp	$29, 0($27)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
 
 	bis	$30, $30, $18		/* Pass SP as 3rd arg.  */
diff --git a/sysdeps/alpha/stpcpy.S b/sysdeps/alpha/stpcpy.S
index 0dc44d353a..9c2668b535 100644
--- a/sysdeps/alpha/stpcpy.S
+++ b/sysdeps/alpha/stpcpy.S
@@ -27,6 +27,10 @@ Cambridge, MA 02139, USA.  */
 
 ENTRY(__stpcpy)
 	ldgp	gp, 0(pv)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
 
 	jsr	t9, __stxcpy	# do the work of the copy
diff --git a/sysdeps/alpha/stpncpy.S b/sysdeps/alpha/stpncpy.S
index 50cda2672e..90470cf95c 100644
--- a/sysdeps/alpha/stpncpy.S
+++ b/sysdeps/alpha/stpncpy.S
@@ -1,24 +1,23 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
    Contributed by Richard Henderson (rth@tamu.edu)
 
-This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
 
-The GNU C Library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public License as
-published by the Free Software Foundation; either version 2 of the
-License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
 
-The GNU C Library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
 
-You should have received a copy of the GNU Library General Public
-License along with the GNU C Library; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.  */
-
-/* Copy no more than COUNT bytes of the null-terminated string from 
+/* Copy no more than COUNT bytes of the null-terminated string from
    SRC to DST.  If SRC does not cover all of COUNT, the balance is
    zeroed.  Return the address of the terminating null in DEST, if
    any, else DEST + COUNT.  */
@@ -32,8 +31,12 @@ Cambridge, MA 02139, USA.  */
 
 ENTRY(__stpncpy)
 	ldgp	gp, 0(pv)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
-	
+
 	beq	a2, $zerocount
 	jsr	t9, __stxncpy	# do the work of the copy
 
diff --git a/sysdeps/alpha/strcat.S b/sysdeps/alpha/strcat.S
index d3afff3c5f..e57259f51d 100644
--- a/sysdeps/alpha/strcat.S
+++ b/sysdeps/alpha/strcat.S
@@ -1,22 +1,21 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
    Contributed by Richard Henderson (rth@tamu.edu)
 
-This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
 
-The GNU C Library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public License as
-published by the Free Software Foundation; either version 2 of the
-License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
 
-The GNU C Library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
-
-You should have received a copy of the GNU Library General Public
-License along with the GNU C Library; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.  */
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
 
 /* Append a null-terminated string from SRC to DST.  */
 
@@ -26,6 +25,10 @@ Cambridge, MA 02139, USA.  */
 
 ENTRY(strcat)
 	ldgp	gp, 0(pv)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
 
 	mov	a0, v0		# set up return value
@@ -59,7 +62,7 @@ $found:	negq    t1, t2		# clear all but least set bit
 	addq	a0, t2, a0
 
 	/* Now do the append.  */
-	
+
 	jsr	t9, __stxcpy
 	ret
 
diff --git a/sysdeps/alpha/strchr.S b/sysdeps/alpha/strchr.S
index c26a8431d2..e35b44ad6c 100644
--- a/sysdeps/alpha/strchr.S
+++ b/sysdeps/alpha/strchr.S
@@ -1,25 +1,24 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
    Contributed by Richard Henderson (rth@tamu.edu)
 
-This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
 
-The GNU C Library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public License as
-published by the Free Software Foundation; either version 2 of the
-License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
 
-The GNU C Library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
-
-You should have received a copy of the GNU Library General Public
-License along with the GNU C Library; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.  */
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
 
 /* Return the address of a given character within a null-terminated
-   string, or null if it is not found. 
+   string, or null if it is not found.
 
    This is generally scheduled for the EV5 (got to look out for my own
    interests :-), but with EV4 needs in mind.  There *should* be no more
@@ -32,7 +31,14 @@ Cambridge, MA 02139, USA.  */
 	.set noat
 
 ENTRY(strchr)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
 
 	zapnot	a1, 1, a1	# e0    : zero extend the search character
 	ldq_u   t0, 0(a0)	# .. e1 : load first quadword
diff --git a/sysdeps/alpha/strcmp.S b/sysdeps/alpha/strcmp.S
new file mode 100644
index 0000000000..7dcae04ea4
--- /dev/null
+++ b/sysdeps/alpha/strcmp.S
@@ -0,0 +1,195 @@
+/* Copyright (C) 1996 Free Software Foundation, Inc.
+   Contributed by Richard Henderson (rth@tamu.edu)
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA.  */
+
+/* Bytewise compare two null-terminated strings.  */
+
+#include <sysdep.h>
+
+	.set noat
+	.set noreorder
+
+	.text
+
+ENTRY(strcmp)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount	# call hint, as in the other PROF prologues
+	.prologue 1
+#else
+	.prologue 0
+#endif
+
+	ldq_u	t0, 0(a0)	# e0    : give cache time to catch up
+	xor	a0, a1, t2	# .. e1 : are s1 and s2 co-aligned?
+	ldq_u	t1, 0(a1)	# e0    :
+	and	t2, 7, t2	# .. e1 : nonzero iff low address bits differ
+	lda	t3, -1		# e0    : all ones, seed for the byte masks
+	bne	t2, $unaligned	# .. e1 :
+
+	/* On entry to this basic block:
+	   t0 == the first word of s1.
+	   t1 == the first word of s2.
+	   t3 == -1.  */
+
+$aligned:
+	mskqh	t3, a0, t3	# e0    : zero mask bytes below the string start
+	nop			# .. e1 :
+	ornot	t1, t3, t1	# e0    : force leading garbage bytes to 0xff
+	ornot	t0, t3, t0	# .. e1 :
+	cmpbge	zero, t1, t7	# e0    : bits set iff null found
+	bne	t7, $eos	# e1 (zdb)
+
+	/* Aligned compare main loop.
+	   On entry to this basic block:
+	   t0 == an s1 word.
+	   t1 == an s2 word not containing a null.  */
+
+$a_loop:
+	xor	t0, t1, t2	# e0	:
+	bne	t2, $wordcmp	# .. e1 (zdb)
+	ldq_u	t1, 8(a1)	# e0    : fetch the next word of s2
+	ldq_u	t0, 8(a0)	# .. e1 : and of s1
+	addq	a1, 8, a1	# e0    :
+	addq	a0, 8, a0	# .. e1 :
+	cmpbge	zero, t1, t7	# e0    :
+	beq	t7, $a_loop	# .. e1 (zdb)
+	br	$eos		# e1    :
+
+	/* The two strings are not co-aligned.  Align s1 and cope.  */
+
+$unaligned:
+	and	a0, 7, t4	# e0    : find s1 misalignment
+	and	a1, 7, t5	# .. e1 : find s2 misalignment
+	subq	a1, t4, a1	# e0    :
+
+	/* If s2 misalignment is larger than s1 misalignment, we need
+	   extra startup checks to avoid SEGV.  */
+
+	cmplt	t4, t5, t8	# .. e1 : t8 = (s1 misalign < s2 misalign)
+	beq	t8, $u_head	# e1    :
+
+	mskqh	t3, t5, t3	# e0    :
+	ornot	t1, t3, t3	# e0    :
+	cmpbge	zero, t3, t7	# e1    : is there a zero?
+	beq	t7, $u_head	# e1    :
+
+	/* We've found a zero in the first partial word of s2.  Align
+	   our current s1 and s2 words and compare what we've got.  */
+
+	extql	t1, t5, t1	# e0    :
+	extql	t0, a0, t0	# e0    :
+	cmpbge	zero, t1, t7	# .. e1 : find that zero again
+	br	$eos		# e1    : and finish up
+
+	.align 3
+$u_head:
+	/* We know just enough now to be able to assemble the first
+	   full word of s2.  We can still find a zero at the end of it.
+
+	   On entry to this basic block:
+	   t0 == first word of s1
+	   t1 == first partial word of s2.  */
+
+	ldq_u	t2, 8(a1)	# e0    : load second partial s2 word
+	lda	t3, -1		# .. e1 : create leading garbage mask
+	extql	t1, a1, t1	# e0    : create first s2 word
+	mskqh	t3, a0, t3	# e0    :
+	extqh	t2, a1, t4	# e0    :
+	ornot	t0, t3, t0	# .. e1 : kill s1 garbage
+	or	t1, t4, t1	# e0    : s2 word now complete
+	cmpbge	zero, t0, t7	# .. e1 : find zero in first s1 word
+	ornot	t1, t3, t1	# e0    : kill s2 garbage
+	lda	t3, -1		# .. e1 :
+	mskql	t3, a1, t3	# e0    : mask for s2[1] bits we have seen
+	bne	t7, $eos	# .. e1 :
+	xor	t0, t1, t4	# e0    : compare aligned words
+	bne	t4, $wordcmp	# .. e1 (zdb)
+	or	t2, t3, t3	# e0    :
+	cmpbge	zero, t3, t7	# e1    :
+	bne	t7, $u_final	# e1    :
+
+	/* Unaligned compare main loop.  In order to avoid reading too much,
+	   the loop is structured to detect zeros in aligned words from s2.
+	   This has, unfortunately, effectively pulled half of a loop
+	   iteration out into the head and half into the tail, but it does
+	   prevent nastiness from accumulating in the very thing we want
+	   to run as fast as possible.
+
+	   On entry to this basic block:
+	   t2 == the unshifted low-bits from the next s2 word.  */
+
+	.align 3
+$u_loop:
+	extql	t2, a1, t3	# e0    :
+	ldq_u	t2, 16(a1)	# .. e1 : load next s2 high bits
+	ldq_u	t0, 8(a0)	# e0    : load next s1 word
+	addq	a1, 8, a1	# .. e1 :
+	addq	a0, 8, a0	# e0    :
+	nop			# .. e1 :
+	extqh	t2, a1, t1	# e0    :
+	cmpbge	zero, t0, t7	# .. e1 : find zero in current s1 word
+	or	t1, t3, t1	# e0    :
+	bne	t7, $eos	# .. e1 :
+	xor	t0, t1, t4	# e0    : compare the words
+	bne	t4, $wordcmp	# .. e1 (zdb)
+	cmpbge	zero, t2, t4	# e0    : find zero in next low bits
+	beq	t4, $u_loop	# .. e1 (zdb)
+
+	/* We've found a zero in the low bits of the last s2 word.  Get
+	   the next s1 word and align them.  */
+$u_final:
+	ldq_u	t0, 8(a0)	# e1    :
+	extql	t2, a1, t1	# .. e0 :
+	cmpbge	zero, t1, t7	# e0    :
+
+	/* We've found a zero somewhere in a word we just read.
+	   On entry to this basic block:
+	   t0 == s1 word
+	   t1 == s2 word
+	   t7 == cmpbge mask containing the zero.  */
+
+	.align 3
+$eos:
+	negq	t7, t6		# e0    : create bytemask of valid data
+	and	t6, t7, t8	# e1    : isolate lowest set bit (first null)
+	subq	t8, 1, t6	# e0    : bits for the bytes before that null
+	or	t6, t8, t7	# e1    : bytes up to and including the null
+	zapnot	t0, t7, t0	# e0    : kill the garbage
+	zapnot	t1, t7, t1	# .. e1 :
+	xor	t0, t1, v0	# e0    : and compare
+	beq	v0, $done	# .. e1 : equal: v0 is already 0
+
+	/* Here we have two differing co-aligned words in t0 & t1.
+           Bytewise compare them and return (t0 > t1 ? 1 : -1).  */
+$wordcmp:
+	cmpbge	t0, t1, t2	# e0    : comparison yields bit mask of ge
+	cmpbge	t1, t0, t3	# .. e1 :
+	xor	t2, t3, t0	# e0    : bits set iff t0/t1 bytes differ
+	negq	t0, t1		# e1    : clear all but least bit
+	and	t0, t1, t0	# e0    :
+	lda	v0, -1		# .. e1 :
+	and	t0, t2, t1	# e0    : was bit set in t0 > t1?
+	cmovne	t1, 1, v0	# .. e1 (zdb)
+
+$done:
+	ret			# e1    :
+
+	END(strcmp)
diff --git a/sysdeps/alpha/strcpy.S b/sysdeps/alpha/strcpy.S
index 2975181919..823476f750 100644
--- a/sysdeps/alpha/strcpy.S
+++ b/sysdeps/alpha/strcpy.S
@@ -27,6 +27,10 @@ Cambridge, MA 02139, USA.  */
 
 ENTRY(strcpy)
 	ldgp	gp, 0(pv)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
 
 	mov	a0, v0		# set up return value
diff --git a/sysdeps/alpha/strlen.S b/sysdeps/alpha/strlen.S
index 9eab707388..026c8addc2 100644
--- a/sysdeps/alpha/strlen.S
+++ b/sysdeps/alpha/strlen.S
@@ -34,6 +34,15 @@ Cambridge, MA 02139, USA.  */
 	.set noat
 
 ENTRY(strlen)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
+	.prologue 0
+#endif
+
 	ldq_u   t0, 0(a0)	# load first quadword (a0 may be misaligned)
 	lda     t1, -1(zero)
 	insqh   t1, a0, t1
diff --git a/sysdeps/alpha/strncat.S b/sysdeps/alpha/strncat.S
index d502037ace..089fba34d9 100644
--- a/sysdeps/alpha/strncat.S
+++ b/sysdeps/alpha/strncat.S
@@ -27,6 +27,10 @@ Cambridge, MA 02139, USA.  */
 
 ENTRY(strncat)
 	ldgp	gp, 0(pv)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
 
 	mov	a0, v0		# set up return value
diff --git a/sysdeps/alpha/strncmp.S b/sysdeps/alpha/strncmp.S
new file mode 100644
index 0000000000..682759042f
--- /dev/null
+++ b/sysdeps/alpha/strncmp.S
@@ -0,0 +1,224 @@
+/* Copyright (C) 1996 Free Software Foundation, Inc.
+   Contributed by Richard Henderson (rth@tamu.edu)
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA.  */
+
+/* Bytewise compare two null-terminated strings of length no longer than N.  */
+
+#include <sysdep.h>
+
+	.set noat
+	.set noreorder
+
+	.text
+
+ENTRY(strncmp)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
+	.prologue 0
+#endif
+
+	xor	a0, a1, t2	# e0    : are s1 and s2 co-aligned?
+	beq	a2, $zerolength	# .. e1 : n == 0 compares equal
+	ldq_u	t0, 0(a0)	# e0    : give cache time to catch up
+	ldq_u	t1, 0(a1)	# .. e1 :
+	and	t2, 7, t2	# e0    :
+	and	a0, 7, t4	# .. e1 : find s1 misalignment
+	lda	t3, -1		# e0    :
+	addq	a2, t4, a2	# .. e1 : bias count by s1 misalignment
+	and	a2, 7, t10	# e1    : ofs of last byte in last word
+	srl	a2, 3, a2	# .. e0 : remaining full words in count
+	and	a1, 7, t5	# e0    : find s2 misalignment
+	bne	t2, $unaligned	# .. e1 :
+
+	/* On entry to this basic block:
+	   t0 == the first word of s1.
+	   t1 == the first word of s2.
+	   t3 == -1.  */
+
+$aligned:
+	mskqh	t3, a1, t3	# e0    : mask off leading garbage
+	nop			# .. e1 :
+	ornot	t1, t3, t1	# e0    :
+	ornot	t0, t3, t0	# .. e1 :
+	cmpbge	zero, t1, t7	# e0    : bits set iff null found
+	beq	a2, $eoc	# .. e1 : check end of count
+	subq	a2, 1, a2	# e0    : first word is consumed from the count
+	bne	t7, $eos	# e1    :
+
+	/* Aligned compare main loop.
+	   On entry to this basic block:
+	   t0 == an s1 word.
+	   t1 == an s2 word not containing a null.  */
+
+$a_loop:
+	xor	t0, t1, t2	# e0	:
+	bne	t2, $wordcmp	# .. e1 (zdb)
+	ldq_u	t1, 8(a1)	# e0    : next s2 word (a1 not yet advanced)
+	ldq_u	t0, 8(a0)	# .. e1 : next s1 word
+	addq	a1, 8, a1	# e0    :
+	addq	a0, 8, a0	# .. e1 :
+	cmpbge	zero, t1, t7	# e0    :
+	beq	a2, $eoc	# .. e1 : no full words left: count ends here
+	subq	a2, 1, a2	# e0    :
+	beq	t7, $a_loop	# .. e1 :
+	br	$eos		# e1    :
+
+	/* The two strings are not co-aligned.  Align s1 and cope.  */
+$unaligned:
+	subq	a1, t4, a1	# e0	 :
+	unop			#        :
+
+	/* If s2 misalignment is larger than s1 misalignment, we need
+	   extra startup checks to avoid SEGV.  */
+
+	cmplt	t4, t5, t8	# .. e1 : t8 = (s1 misalign < s2 misalign)
+	beq	t8, $u_head	# e1    :
+
+	mskqh	t3, t5, t3	# e0    :
+	ornot	t1, t3, t3	# e0    :
+	cmpbge	zero, t3, t7	# e1    : is there a zero?
+	beq	t7, $u_head	# e1    :
+
+	/* We've found a zero in the first partial word of s2.  Align
+	   our current s1 and s2 words and compare what we've got.  */
+
+	extql	t1, t5, t1	# e0    :
+	lda	t3, -1		# .. e1 :
+	insql	t1, a0, t1	# e0    :
+	mskqh	t3, a0, t3	# e0    :
+	ornot	t1, t3, t1	# e0    :
+	ornot	t0, t3, t0	# .. e1 :
+	cmpbge	zero, t1, t7	# e0    : find that zero again
+	beq	a2, $eoc	# .. e1 : and finish up
+	br	$eos		# e1    :
+
+	.align 3
+$u_head:
+	/* We know just enough now to be able to assemble the first
+	   full word of s2.  We can still find a zero at the end of it.
+
+	   On entry to this basic block:
+	   t0 == first word of s1
+	   t1 == first partial word of s2.  */
+
+	ldq_u	t2, 8(a1)	# e0    : load second partial s2 word
+	lda	t3, -1		# .. e1 : create leading garbage mask
+	extql	t1, a1, t1	# e0    : create first s2 word
+	mskqh	t3, a0, t3	# e0    :
+	extqh	t2, a1, t4	# e0    :
+	ornot	t0, t3, t0	# .. e1 : kill s1 garbage
+	or	t1, t4, t1	# e0    : s2 word now complete
+	ornot	t1, t3, t1	# e1    : kill s2 garbage
+	cmpbge	zero, t0, t7	# e0    : find zero in first s1 word
+	beq	a2, $eoc	# .. e1 :
+	lda	t3, -1		# e0    :
+	bne	t7, $eos	# .. e1 :
+	subq	a2, 1, a2	# e0    :
+	xor	t0, t1, t4	# .. e1 : compare aligned words
+	mskql	t3, a1, t3	# e0    : mask out s2[1] bits we have seen
+	bne	t4, $wordcmp	# .. e1 :
+	or	t2, t3, t3	# e0    :
+	cmpbge	zero, t3, t7	# e1    : find zero in high bits of s2[1]
+	bne	t7, $u_final	# e1    :
+
+	/* Unaligned compare main loop.  In order to avoid reading too much,
+	   the loop is structured to detect zeros in aligned words from s2.
+	   This has, unfortunately, effectively pulled half of a loop
+	   iteration out into the head and half into the tail, but it does
+	   prevent nastiness from accumulating in the very thing we want
+	   to run as fast as possible.
+
+	   On entry to this basic block:
+	   t2 == the unshifted low-bits from the next s2 word.  */
+
+	.align 3
+$u_loop:
+	extql	t2, a1, t3	# e0    :
+	ldq_u	t2, 16(a1)	# .. e1 : load next s2 high bits
+	ldq_u	t0, 8(a0)	# e0    : load next s1 word
+	addq	a1, 8, a1	# .. e1 :
+	addq	a0, 8, a0	# e0    :
+	nop			# .. e1 :
+	extqh	t2, a1, t1	# e0    :
+	cmpbge	zero, t0, t7	# .. e1 : find zero in current s1 word
+	or	t1, t3, t1	# e0    :
+	beq	a2, $eoc	# .. e1 : check for end of count
+	subq	a2, 1, a2	# e0    :
+	bne	t7, $eos	# .. e1 :
+	xor	t0, t1, t4	# e0    : compare the words
+	bne	t4, $wordcmp	# .. e1 (zdb)
+	cmpbge	zero, t2, t4	# e0    : find zero in next low bits
+	beq	t4, $u_loop	# .. e1 (zdb)
+
+	/* We've found a zero in the low bits of the last s2 word.  Get
+	   the next s1 word and align them.  */
+$u_final:
+	ldq_u	t0, 8(a0)	# e1    :
+	extql	t2, a1, t1	# .. e0 :
+	cmpbge	zero, t1, t7	# e0    :
+	bne	a2, $eos	# .. e1 : count not exhausted: null decides
+
+	/* We've hit end of count.  Zero everything after the count
+	   and compare whats left.  */
+
+	.align 3
+$eoc:
+	mskql	t0, t10, t0	# zero s1 bytes at offset >= t10
+	mskql	t1, t10, t1	# zero s2 bytes at offset >= t10
+
+	/* We've found a zero somewhere in a word we just read.
+	   On entry to this basic block:
+	   t0 == s1 word
+	   t1 == s2 word
+	   t7 == cmpbge mask containing the zero.  */
+
+$eos:
+	negq	t7, t6		# e0    : create bytemask of valid data
+	and	t6, t7, t8	# e1    : isolate lowest set bit (first null)
+	subq	t8, 1, t6	# e0    : bits for the bytes before that null
+	or	t6, t8, t7	# e1    : bytes up to and including the null
+	zapnot	t0, t7, t0	# e0    : kill the garbage
+	zapnot	t1, t7, t1	# .. e1 :
+	xor	t0, t1, v0	# e0    : and compare
+	beq	v0, $done	# .. e1 : equal: v0 is already 0
+
+	/* Here we have two differing co-aligned words in t0 & t1.
+           Bytewise compare them and return (t0 > t1 ? 1 : -1).  */
+$wordcmp:
+	cmpbge	t0, t1, t2	# e0    : comparison yields bit mask of ge
+	cmpbge	t1, t0, t3	# .. e1 :
+	xor	t2, t3, t0	# e0    : bits set iff t0/t1 bytes differ
+	negq	t0, t1		# e1    : clear all but least bit
+	and	t0, t1, t0	# e0    :
+	lda	v0, -1		# .. e1 :
+	and	t0, t2, t1	# e0    : was bit set in t0 > t1?
+	cmovne	t1, 1, v0	# .. e1 (zdb)
+
+$done:
+	ret			# e1    :
+
+$zerolength:
+	clr	v0
+	ret
+
+	END(strncmp)
diff --git a/sysdeps/alpha/strncpy.S b/sysdeps/alpha/strncpy.S
index e13769c5c3..c077ab35b7 100644
--- a/sysdeps/alpha/strncpy.S
+++ b/sysdeps/alpha/strncpy.S
@@ -31,6 +31,10 @@ Cambridge, MA 02139, USA.  */
 
 ENTRY(strncpy)
 	ldgp	gp, 0(pv)
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
 	.prologue 1
 
 	mov	a0, v0		# set return value now
diff --git a/sysdeps/alpha/strrchr.S b/sysdeps/alpha/strrchr.S
index 464f754b20..02f37f50be 100644
--- a/sysdeps/alpha/strrchr.S
+++ b/sysdeps/alpha/strrchr.S
@@ -31,7 +31,14 @@ Cambridge, MA 02139, USA.  */
 	.set noat
 
 ENTRY(strrchr)
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
 	.prologue 0
+#endif
 
 	zapnot	a1, 1, a1	# e0    : zero extend our test character
 	mov	zero, t6	# .. e1 : t6 is last match aligned addr
diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S
index eb134f25c7..75d11827f9 100644
--- a/sysdeps/alpha/udiv_qrnnd.S
+++ b/sysdeps/alpha/udiv_qrnnd.S
@@ -27,8 +27,15 @@
 	.text
 
 LEAF(__udiv_qrnnd, 0)
-
+#ifdef PROF
+	ldgp	gp, 0(pv)
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.prologue 1
+#else
         .prologue 0
+#endif
+
 #define cnt	$2
 #define tmp	$3
 #define rem_ptr	$16
@@ -38,9 +45,9 @@ LEAF(__udiv_qrnnd, 0)
 #define qb	$20
 
 	ldiq	cnt,16
-	blt	d,.Largedivisor
+	blt	d,$largedivisor
 
-.Loop1:	cmplt	n0,0,tmp
+$loop1:	cmplt	n0,0,tmp
 	addq	n1,n1,n1
 	bis	n1,tmp,n1
 	addq	n0,n0,n0
@@ -73,12 +80,12 @@ LEAF(__udiv_qrnnd, 0)
 	cmovne	qb,tmp,n1
 	bis	n0,qb,n0
 	subq	cnt,1,cnt
-	bgt	cnt,.Loop1
+	bgt	cnt,$loop1
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
 
-.Largedivisor:
+$largedivisor:
 	and	n0,1,$4
 
 	srl	n0,1,n0
@@ -90,7 +97,7 @@ LEAF(__udiv_qrnnd, 0)
 	srl	d,1,$5
 	addq	$5,$6,$5
 
-.Loop2:	cmplt	n0,0,tmp
+$loop2:	cmplt	n0,0,tmp
 	addq	n1,n1,n1
 	bis	n1,tmp,n1
 	addq	n0,n0,n0
@@ -123,27 +130,30 @@ LEAF(__udiv_qrnnd, 0)
 	cmovne	qb,tmp,n1
 	bis	n0,qb,n0
 	subq	cnt,1,cnt
-	bgt	cnt,.Loop2
+	bgt	cnt,$loop2
 
 	addq	n1,n1,n1
 	addq	$4,n1,n1
-	bne	$6,.LOdd
+	bne	$6,$Odd
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
 
-.LOdd:
+$Odd:
 	/* q' in n0. r' in n1 */
 	addq	n1,n0,n1
+
 	cmpult	n1,n0,tmp	# tmp := carry from addq
-	beq	tmp,.LLp6
-	addq	n0,1,n0
-	subq	n1,d,n1
-.LLp6:	cmpult	n1,d,tmp
-	bne	tmp,.LLp7
-	addq	n0,1,n0
-	subq	n1,d,n1
-.LLp7:
+	subq	n1,d,AT
+	addq	n0,tmp,n0
+	cmovne	tmp,AT,n1
+
+	cmpult	n1,d,tmp
+	addq	n0,1,AT
+	cmoveq	tmp,AT,n0
+	subq	n1,d,AT
+	cmoveq	tmp,AT,n1
+
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
diff --git a/sysdeps/alpha/w_sqrt.S b/sysdeps/alpha/w_sqrt.S
new file mode 100644
index 0000000000..b5c980e557
--- /dev/null
+++ b/sysdeps/alpha/w_sqrt.S
@@ -0,0 +1,161 @@
+/* Copyright (C) 1996 Free Software Foundation, Inc.
+   Contributed by David Mosberger (davidm@cs.arizona.edu).
+   Based on public-domain C source by Linus Torvalds.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA.  */
+
+/* This version is much faster than generic sqrt implementation, but
+   it doesn't handle exceptional values or the inexact flag.  Don't use
+   this if _IEEE_FP or _IEEE_FP_INEXACT is in effect. */
+
+#ifndef _IEEE_FP
+
+#include <errnos.h>
+#include <sysdep.h>
+
+	.set noreorder
+
+#ifdef __ELF__
+	.section .rodata
+#else
+	.rdata
+#endif
+	.align 5        # align to cache line
+
+	/* Do all memory accesses relative to sqrtdata.  */
+sqrtdata:
+
+#define DN                     0x00
+#define UP                     0x08
+#define HALF                   0x10
+#define ALMOST_THREE_HALF      0x18
+#define T2                     0x20
+
+	.quad 0x3fefffffffffffff        /* DN = prev(1.0), just below 1.0 */
+	.quad 0x3ff0000000000001        /* UP = next(1.0), just above 1.0 */
+	.quad 0x3fe0000000000000        /* HALF = 0.5 */
+	.quad 0x3ff7ffffffc00000        /* ALMOST_THREE_HALF = 1.5-2^-30 */
+
+/* table T2: */
+.long   0x1500, 0x2ef8,   0x4d67,  0x6b02,  0x87be,  0xa395,  0xbe7a,  0xd866
+.long   0xf14a, 0x1091b, 0x11fcd, 0x13552, 0x14999, 0x15c98, 0x16e34, 0x17e5f
+.long  0x18d03, 0x19a01, 0x1a545, 0x1ae8a, 0x1b5c4, 0x1bb01, 0x1bfde, 0x1c28d
+.long  0x1c2de, 0x1c0db, 0x1ba73, 0x1b11c, 0x1a4b5, 0x1953d, 0x18266, 0x16be0
+.long  0x1683e, 0x179d8, 0x18a4d, 0x19992, 0x1a789, 0x1b445, 0x1bf61, 0x1c989
+.long  0x1d16d, 0x1d77b, 0x1dddf, 0x1e2ad, 0x1e5bf, 0x1e6e8, 0x1e654, 0x1e3cd
+.long  0x1df2a, 0x1d635, 0x1cb16, 0x1be2c, 0x1ae4e, 0x19bde, 0x1868e, 0x16e2e
+.long  0x1527f, 0x1334a, 0x11051,  0xe951,  0xbe01,  0x8e0d,  0x5924,  0x1edd
+
+/*
+ * Stack variables:
+ */
+#define K      16(sp)
+#define Y      24(sp)
+#define FSIZE  32
+
+	.text
+
+LEAF(__sqrt, FSIZE)
+	lda	sp, -FSIZE(sp)
+	ldgp	gp, .-__sqrt(pv)
+	stq	ra, 0(sp)			# saved for the $negative path
+#ifdef PROF
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+#endif
+	.prologue 1
+
+	stt	$f16, K				# spill x so its bits can be read
+	lda	t3, sqrtdata			# load base address into t3
+
+	fblt	$f16, $negative			# x < 0: domain error
+
+	/* Compute initial guess.  */
+
+	.align 3
+
+	ldah	t1, 0x5fe8			# e0    :
+	ldq	t2, K				# .. e1 :
+	ldt	$f12, HALF(t3)			# e0    :
+	ldt	$f18, ALMOST_THREE_HALF(t3)	# .. e1 :
+	srl	t2, 33, t0			# e0    :
+	mult	$f16, $f12, $f11		# .. fm : $f11 = x * 0.5
+	subl	t1, t0, t1			# e0    :
+	addt	$f12, $f12, $f17		# .. fa : $f17 = 1.0
+	srl	t1, 12, t0			# e0    :
+	and	t0, 0xfc, t0			# .. e1 :
+	addq	t0, t3, t0			# e0    :
+	ldl	t0, T2(t0)			# .. e1 :
+	addt	$f12, $f17, $f15		# fa    : $f15 = 1.5
+	subl	t1, t0, t1			# .. e1 :
+	sll	t1, 32, t1			# e0    :
+	ldt	$f14, DN(t3)			# .. e1 :
+	stq	t1, Y				# e0    :
+	ldt	$f13, Y				# e1    :
+	addq	sp, FSIZE, sp			# e0    : last stack access is done
+
+	mult	$f11, $f13, $f10	# fm    : $f10 = (x * 0.5) * y
+	mult	$f10, $f13, $f10	# fm    : $f10 = ((x * 0.5) * y) * y
+	subt	$f15, $f10, $f1		# fa    : $f1 = (1.5 - 0.5*x*y*y)
+	mult	$f13, $f1, $f13         # fm    : yp = y*(1.5 - 0.5*x*y*y)
+ 	mult	$f11, $f13, $f11	# fm    : $f11 = x * 0.5 * yp
+	mult	$f11, $f13, $f11	# fm    : $f11 = (x * 0.5 * yp) * yp
+	subt	$f18, $f11, $f1		# fa    : $f1= (1.5-2^-30) - 0.5*x*yp*yp
+	mult	$f13, $f1, $f13		# fm    : ypp = $f13 = yp*$f1
+	subt	$f15, $f12, $f1		# fa    : $f1 = (1.5 - 0.5)
+	ldt	$f15, UP(t3)		# .. e1 :
+	mult	$f16, $f13, $f10	# fm    : z = $f10 = x * ypp
+	mult	$f10, $f13, $f11	# fm    : $f11 = z*ypp
+	mult	$f10, $f12, $f12	# fm    : $f12 = z*0.5
+	subt	$f1, $f11, $f1		# .. fa : $f1 = 1 - z*ypp
+	mult	$f12, $f1, $f12		# fm    : $f12 = z*0.5*(1 - z*ypp)
+	addt	$f10, $f12, $f0		# fa    : zp=res=$f0= z + z*0.5*(1 - z*ypp)
+
+	mult/c	$f0, $f14, $f12		# fm    : zmi = zp * DN
+	mult/c	$f0, $f15, $f11		# fm    : zpl = zp * UP
+	mult/c	$f0, $f12, $f1		# fm    : $f1 = zp * zmi
+	mult/c	$f0, $f11, $f15		# fm    : $f15 = zp * zpl
+
+	subt    $f1, $f16, $f13		# fa    : y1 = zp*zmi - x
+	subt    $f15, $f16, $f15	# fa    : y2 = zp*zpl - x
+
+	fcmovge	$f13, $f12, $f0		# res = (y1 >= 0) ? zmi : res
+	fcmovlt	$f15, $f11, $f0		# res = (y2 <  0) ? zpl : res
+
+	ret
+
+$negative:
+	lda	t1, -1
+	stq	t1, K			# K = all-ones bit pattern
+	lda	t1, EDOM
+	stl	t1, errno
+#ifdef _LIBC_REENTRANT
+	jsr	ra, __errno_location
+	lda	t1, EDOM		# reload: t1 may not survive the call
+	ldq	ra, 0(sp)		# restore return address saved on entry
+	stl	t1, 0(v0)		# *__errno_location () = EDOM
+#endif
+	ldt	$f0, K			# res = NaN (all-ones bit pattern)
+	addq	sp, FSIZE, sp
+	ret
+
+	END(__sqrt)
+
+weak_alias(__sqrt, sqrt)
+
+#endif /* !_IEEE_FP */