Diffstat (limited to 'REORG.TODO/sysdeps/sparc/sparc64/multiarch')
28 files changed, 3292 insertions, 0 deletions
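This diff adds SPARC64 multiarch support: per-CPU memory/string and mpn implementations (ultra1/ultra3, niagara1/2/4, VIS3), crypto-opcode MD5/SHA block functions, the ifunc-impl-list enumeration, and assembly IFUNC resolvers (memcpy.S, add_n.S, addmul_1.S, ...) that choose an implementation from the hwcap bits passed to the resolver. As a rough orientation only, the C sketch below mirrors the priority order used by the memcpy resolver further down in this patch; the select_memcpy helper, the includes, and the getauxval usage are illustrative assumptions and are not part of the patch, while the __memcpy_* names and HWCAP_SPARC_* bits are the ones the patch itself uses.

```c
/* Illustrative sketch of the hwcap dispatch performed by the assembly
   IFUNC stub in memcpy.S below.  Only the __memcpy_* names and the
   HWCAP_SPARC_* bits come from the patch; everything else is ours.  */
#include <stddef.h>
#include <sys/auxv.h>   /* getauxval; on sparc this also exposes the HWCAP_SPARC_* bits.  */

extern void *__memcpy_ultra1 (void *, const void *, size_t);
extern void *__memcpy_ultra3 (void *, const void *, size_t);
extern void *__memcpy_niagara1 (void *, const void *, size_t);
extern void *__memcpy_niagara2 (void *, const void *, size_t);
extern void *__memcpy_niagara4 (void *, const void *, size_t);

typedef void *(*memcpy_fn) (void *, const void *, size_t);

static memcpy_fn
select_memcpy (unsigned long hwcap)
{
  if (hwcap & HWCAP_SPARC_CRYPTO)   /* Niagara-4 and later.  */
    return __memcpy_niagara4;
  if (hwcap & HWCAP_SPARC_N2)       /* Niagara-2.  */
    return __memcpy_niagara2;
  if (hwcap & HWCAP_SPARC_BLKINIT)  /* Niagara-1.  */
    return __memcpy_niagara1;
  if (hwcap & HWCAP_SPARC_ULTRA3)   /* UltraSPARC-III.  */
    return __memcpy_ultra3;
  return __memcpy_ultra1;           /* Generic UltraSPARC fallback.  */
}

/* Example use: select_memcpy (getauxval (AT_HWCAP)).  In the real
   resolver the dynamic linker passes hwcap as the argument.  */
```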
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/Makefile b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/Makefile new file mode 100644 index 0000000000..55b757f9ad --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/Makefile @@ -0,0 +1,21 @@ +ifeq ($(subdir),crypt) +libcrypt-sysdep_routines += md5-crop sha256-crop sha512-crop +endif + +ifeq ($(subdir),locale) +localedef-aux += md5-crop +endif + +ifeq ($(subdir),string) +sysdep_routines += memcpy-ultra3 memcpy-niagara1 memcpy-niagara2 \ + memset-niagara1 memcpy-niagara4 memset-niagara4 +endif + +ifeq ($(subdir),stdlib) +sysdep_routines += mul_1-vis3 addmul_1-vis3 submul_1-vis3 add_n-vis3 sub_n-vis3 +endif + +ifeq ($(subdir),math) +gmp-sysdep_routines = mul_1-vis3 addmul_1-vis3 submul_1-vis3 add_n-vis3 \ + sub_n-vis3 +endif diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n-vis3.S new file mode 100644 index 0000000000..c038bcbd6e --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n-vis3.S @@ -0,0 +1,67 @@ +! SPARC v9 64-bit VIS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and +! store sum in a third limb vector. +! +! Copyright (C) 2013-2017 Free Software Foundation, Inc. +! This file is part of the GNU C Library. +! Contributed by David S. Miller <davem@davemloft.net> +! +! The GNU C Library is free software; you can redistribute it and/or +! modify it under the terms of the GNU Lesser General Public +! License as published by the Free Software Foundation; either +! version 2.1 of the License, or (at your option) any later version. +! +! The GNU C Library is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public +! License along with the GNU C Library; if not, see +! <http://www.gnu.org/licenses/>. + +#include <sysdep.h> + +#define res_ptr %o0 +#define s1_ptr %o1 +#define s2_ptr %o2 +#define sz %o3 +#define tmp1 %g1 +#define tmp2 %g2 +#define tmp3 %g3 +#define tmp4 %o4 + + .register %g2,#scratch + .register %g3,#scratch +ENTRY(__mpn_add_n_vis3) + subcc sz, 1, sz + be .Lfinal_limb + cmp %g0, 0 + +.Lloop: + ldx [s2_ptr + 0x00], tmp1 + add s2_ptr, 0x10, s2_ptr + ldx [s1_ptr + 0x00], tmp2 + add s1_ptr, 0x10, s1_ptr + ldx [s2_ptr - 0x08], tmp3 + add res_ptr, 0x10, res_ptr + ldx [s1_ptr - 0x08], tmp4 + sub sz, 2, sz + addxccc tmp1, tmp2, tmp1 + stx tmp1, [res_ptr - 0x10] + addxccc tmp3, tmp4, tmp3 + brgz sz, .Lloop + stx tmp3, [res_ptr - 0x08] + + brlz,pt sz, .Lfinish + nop + +.Lfinal_limb: + ldx [s2_ptr + 0x00], tmp1 + ldx [s1_ptr + 0x00], tmp2 + addxccc tmp1, tmp2, tmp1 + stx tmp1, [res_ptr + 0x00] + +.Lfinish: + retl + addxc %g0, %g0, %o0 +END(__mpn_add_n_vis3) diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n.S new file mode 100644 index 0000000000..9ffaf7865b --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n.S @@ -0,0 +1,56 @@ +/* Multiple versions of add_n + + Copyright (C) 2013-2017 Free Software Foundation, Inc. + Contributed by David S. Miller (davem@davemloft.net) + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY(__mpn_add_n) + .type __mpn_add_n, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_VIS3, %o1 + andcc %o0, %o1, %g0 + be 1f + nop +# ifdef SHARED + sethi %gdop_hix22(__mpn_add_n_vis3), %o1 + xor %o1, %gdop_lox10(__mpn_add_n_vis3), %o1 +# else + set __mpn_add_n_vis3, %o1 +# endif + ba 10f + nop +1: +# ifdef SHARED + sethi %gdop_hix22(__mpn_add_n_generic), %o1 + xor %o1, %gdop_lox10(__mpn_add_n_generic), %o1 +# else + set __mpn_add_n_generic, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(__mpn_add_n) + +#define __mpn_add_n __mpn_add_n_generic +#include "../add_n.S" diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1-vis3.S new file mode 100644 index 0000000000..64671f5079 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1-vis3.S @@ -0,0 +1,87 @@ +! SPARC v9 64-bit VIS3 __mpn_addmul_1 -- Multiply a limb vector with a +! limb and add the result to a second limb vector. +! +! Copyright (C) 2013-2017 Free Software Foundation, Inc. +! This file is part of the GNU C Library. +! Contributed by David S. Miller <davem@davemloft.net> +! +! The GNU C Library is free software; you can redistribute it and/or +! modify it under the terms of the GNU Lesser General Public +! License as published by the Free Software Foundation; either +! version 2.1 of the License, or (at your option) any later version. +! +! The GNU C Library is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public +! License along with the GNU C Library; if not, see +! <http://www.gnu.org/licenses/>. 
+ +#include <sysdep.h> + +#define res_ptr %i0 +#define s1_ptr %i1 +#define sz %i2 +#define s2_limb %i3 +#define carry %o5 +#define tmp1 %g1 +#define tmp2 %g2 +#define tmp3 %g3 +#define tmp4 %o4 +#define tmp5 %l0 +#define tmp6 %l1 +#define tmp7 %l2 +#define tmp8 %l3 + + .register %g2,#scratch + .register %g3,#scratch +ENTRY(__mpn_addmul_1_vis3) + save %sp, -176, %sp + subcc sz, 1, sz + be .Lfinal_limb + clr carry + +.Lloop: + ldx [s1_ptr + 0x00], tmp1 + ldx [res_ptr + 0x00], tmp3 + ldx [s1_ptr + 0x08], tmp2 + ldx [res_ptr + 0x08], tmp4 + mulx tmp1, s2_limb, tmp5 + add s1_ptr, 0x10, s1_ptr + umulxhi tmp1, s2_limb, tmp6 + add res_ptr, 0x10, res_ptr + mulx tmp2, s2_limb, tmp7 + sub sz, 2, sz + umulxhi tmp2, s2_limb, tmp8 + addcc carry, tmp5, tmp5 + addxc %g0, tmp6, carry + addcc tmp3, tmp5, tmp5 + addxc %g0, carry, carry + stx tmp5, [res_ptr - 0x10] + addcc carry, tmp7, tmp7 + addxc %g0, tmp8, carry + addcc tmp4, tmp7, tmp7 + addxc %g0, carry, carry + brgz sz, .Lloop + stx tmp7, [res_ptr - 0x08] + + brlz,pt sz, .Lfinish + nop + +.Lfinal_limb: + ldx [s1_ptr + 0x00], tmp1 + ldx [res_ptr + 0x00], tmp3 + mulx tmp1, s2_limb, tmp5 + umulxhi tmp1, s2_limb, tmp6 + addcc carry, tmp5, tmp5 + addxc %g0, tmp6, carry + addcc tmp3, tmp5, tmp5 + addxc %g0, carry, carry + stx tmp5, [res_ptr + 0x00] + +.Lfinish: + jmpl %i7 + 8, %g0 + restore carry, 0, %o0 +END(__mpn_addmul_1_vis3) diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1.S new file mode 100644 index 0000000000..dcb1da184c --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1.S @@ -0,0 +1,56 @@ +/* Multiple versions of addmul_1 + + Copyright (C) 2013-2017 Free Software Foundation, Inc. + Contributed by David S. Miller (davem@davemloft.net) + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +ENTRY(__mpn_addmul_1) + .type __mpn_addmul_1, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_VIS3, %o1 + andcc %o0, %o1, %g0 + be 1f + nop +# ifdef SHARED + sethi %gdop_hix22(__mpn_addmul_1_vis3), %o1 + xor %o1, %gdop_lox10(__mpn_addmul_1_vis3), %o1 +# else + set __mpn_addmul_1_vis3, %o1 +# endif + ba 10f + nop +1: +# ifdef SHARED + sethi %gdop_hix22(__mpn_addmul_1_generic), %o1 + xor %o1, %gdop_lox10(__mpn_addmul_1_generic), %o1 +# else + set __mpn_addmul_1_generic, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(__mpn_addmul_1) + +#define __mpn_addmul_1 __mpn_addmul_1_generic +#include "../addmul_1.S" diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..a97bc455a8 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c @@ -0,0 +1,75 @@ +/* Enumerate available IFUNC implementations of a function. sparc version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ldsodefs.h> +#include <sysdep.h> +#include <ifunc-impl-list.h> + +/* Fill ARRAY of MAX elements with IFUNC implementations for function + NAME and return the number of valid entries. 
*/ + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + size_t i = 0; + int hwcap; + + hwcap = GLRO(dl_hwcap); + + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_CRYPTO, + __memcpy_niagara4) + IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_N2, + __memcpy_niagara2) + IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_BLKINIT, + __memcpy_niagara1) + IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_ULTRA3, + __memcpy_ultra3) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ultra1)); + + IFUNC_IMPL (i, name, mempcpy, + IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_CRYPTO, + __mempcpy_niagara4) + IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_N2, + __mempcpy_niagara2) + IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_BLKINIT, + __mempcpy_niagara1) + IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_ULTRA3, + __mempcpy_ultra3) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ultra1)); + + IFUNC_IMPL (i, name, bzero, + IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_CRYPTO, + __bzero_niagara4) + IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_BLKINIT, + __bzero_niagara1) + IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ultra1)); + + IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_CRYPTO, + __memset_niagara4) + IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_BLKINIT, + __memset_niagara1) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ultra1)); + + return i; +} diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-block.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-block.c new file mode 100644 index 0000000000..7c1a3a368f --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-block.c @@ -0,0 +1,29 @@ +#include <sparc-ifunc.h> + +#define __md5_process_block __md5_process_block_generic +extern void __md5_process_block_generic (const void *buffer, size_t len, + struct md5_ctx *ctx); + +#include <crypt/md5-block.c> + +#undef __md5_process_block + +extern void __md5_process_block_crop (const void *buffer, size_t len, + struct md5_ctx *ctx); +static bool cpu_supports_md5(int hwcap) +{ + unsigned long cfr; + + if (!(hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ ("rd %%asr26, %0" : "=r" (cfr)); + if (cfr & (1 << 4)) + return true; + + return false; +} + +extern void __md5_process_block (const void *buffer, size_t len, + struct md5_ctx *ctx); +sparc_libc_ifunc(__md5_process_block, cpu_supports_md5(hwcap) ? __md5_process_block_crop : __md5_process_block_generic); diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-crop.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-crop.S new file mode 100644 index 0000000000..e8810da83e --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-crop.S @@ -0,0 +1,110 @@ +/* MD5 using sparc crypto opcodes. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define ASI_PL 0x88 + +#define MD5 \ + .word 0x81b02800; + + .text + .align 32 +ENTRY(__md5_process_block_crop) + /* %o0=buffer, %o1=len, %o2=CTX */ + ld [%o2 + 0x10], %g1 + add %g1, %o1, %o4 + st %o4, [%o2 + 0x10] + clr %o5 + cmp %o4, %g1 + movlu %icc, 1, %o5 +#ifdef __arch64__ + srlx %o1, 32, %o4 + add %o5, %o4, %o5 +#endif + ld [%o2 + 0x14], %o4 + add %o4, %o5, %o4 + st %o4, [%o2 + 0x14] + lda [%o2] ASI_PL, %f0 + add %o2, 0x4, %g1 + lda [%g1] ASI_PL, %f1 + add %o2, 0x8, %g1 + andcc %o0, 0x7, %g0 + lda [%g1] ASI_PL, %f2 + add %o2, 0xc, %g1 + bne,pn %xcc, 10f + lda [%g1] ASI_PL, %f3 + +1: + ldd [%o0 + 0x00], %f8 + ldd [%o0 + 0x08], %f10 + ldd [%o0 + 0x10], %f12 + ldd [%o0 + 0x18], %f14 + ldd [%o0 + 0x20], %f16 + ldd [%o0 + 0x28], %f18 + ldd [%o0 + 0x30], %f20 + ldd [%o0 + 0x38], %f22 + + MD5 + + subcc %o1, 64, %o1 + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + +5: + sta %f0, [%o2] ASI_PL + add %o2, 0x4, %g1 + sta %f1, [%g1] ASI_PL + add %o2, 0x8, %g1 + sta %f2, [%g1] ASI_PL + add %o2, 0xc, %g1 + retl + sta %f3, [%g1] ASI_PL +10: + alignaddr %o0, %g0, %o0 + + ldd [%o0 + 0x00], %f10 +1: + ldd [%o0 + 0x08], %f12 + ldd [%o0 + 0x10], %f14 + ldd [%o0 + 0x18], %f16 + ldd [%o0 + 0x20], %f18 + ldd [%o0 + 0x28], %f20 + ldd [%o0 + 0x30], %f22 + ldd [%o0 + 0x38], %f24 + ldd [%o0 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + MD5 + + subcc %o1, 64, %o1 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + + ba,a,pt %xcc, 5b +END(__md5_process_block_crop) diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S new file mode 100644 index 0000000000..ccf42446e8 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S @@ -0,0 +1,347 @@ +/* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara. + Copyright (C) 2006-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define ASI_P 0x80 +#define ASI_PNF 0x82 + +#define LOAD(type,addr,dest) type##a [addr] ASI_P, dest +#define LOAD_TWIN(addr_reg,dest0,dest1) \ + ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0 + +#define STORE(type,src,addr) type src, [addr] +#define STORE_INIT(src,addr) stxa src, [addr] %asi + +#ifndef XCC +#define USE_BPR +#define XCC xcc +#endif + +#if IS_IN (libc) + + .register %g2,#scratch + .register %g3,#scratch + .register %g6,#scratch + + .text + +ENTRY(__mempcpy_niagara1) + ba,pt %XCC, 101f + add %o0, %o2, %g5 +END(__mempcpy_niagara1) + + .align 32 +ENTRY(__memcpy_niagara1) +100: /* %o0=dst, %o1=src, %o2=len */ + mov %o0, %g5 +101: +# ifndef USE_BPR + srl %o2, 0, %o2 +# endif + cmp %o2, 0 + be,pn %XCC, 85f +218: or %o0, %o1, %o3 + cmp %o2, 16 + blu,a,pn %XCC, 80f + or %o3, %o2, %o3 + + /* 2 blocks (128 bytes) is the minimum we can do the block + * copy with. We need to ensure that we'll iterate at least + * once in the block copy loop. At worst we'll need to align + * the destination to a 64-byte boundary which can chew up + * to (64 - 1) bytes from the length before we perform the + * block copy loop. + */ + cmp %o2, (2 * 64) + blu,pt %XCC, 70f + andcc %o3, 0x7, %g0 + + /* %o0: dst + * %o1: src + * %o2: len (known to be >= 128) + * + * The block copy loops will use %o4/%o5,%g2/%g3 as + * temporaries while copying the data. + */ + + LOAD(prefetch, %o1, #one_read) + wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi + + /* Align destination on 64-byte boundary. */ + andcc %o0, (64 - 1), %o4 + be,pt %XCC, 2f + sub %o4, 64, %o4 + sub %g0, %o4, %o4 ! bytes to align dst + sub %o2, %o4, %o2 +1: subcc %o4, 1, %o4 + LOAD(ldub, %o1, %g1) + STORE(stb, %g1, %o0) + add %o1, 1, %o1 + bne,pt %XCC, 1b + add %o0, 1, %o0 + + /* If the source is on a 16-byte boundary we can do + * the direct block copy loop. If it is 8-byte aligned + * we can do the 16-byte loads offset by -8 bytes and the + * init stores offset by one register. + * + * If the source is not even 8-byte aligned, we need to do + * shifting and masking (basically integer faligndata). + * + * The careful bit with init stores is that if we store + * to any part of the cache line we have to store the whole + * cacheline else we can end up with corrupt L2 cache line + * contents. Since the loop works on 64-bytes of 64-byte + * aligned store data at a time, this is easy to ensure. + */ +2: + andcc %o1, (16 - 1), %o4 + andn %o2, (64 - 1), %g1 ! block copy loop iterator + sub %o2, %g1, %o2 ! final sub-block copy bytes + be,pt %XCC, 50f + cmp %o4, 8 + be,a,pt %XCC, 10f + sub %o1, 0x8, %o1 + + /* Neither 8-byte nor 16-byte aligned, shift and mask. 
*/ + mov %g1, %o4 + and %o1, 0x7, %g1 + sll %g1, 3, %g1 + mov 64, %o3 + andn %o1, 0x7, %o1 + LOAD(ldx, %o1, %g2) + sub %o3, %g1, %o3 + sllx %g2, %g1, %g2 + +#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\ + LOAD(ldx, SRC, TMP1); \ + srlx TMP1, PRE_SHIFT, TMP2; \ + or TMP2, PRE_VAL, TMP2; \ + STORE_INIT(TMP2, DST); \ + sllx TMP1, POST_SHIFT, PRE_VAL; + +1: add %o1, 0x8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00) + add %o1, 0x8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08) + add %o1, 0x8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10) + add %o1, 0x8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18) + add %o1, 32, %o1 + LOAD(prefetch, %o1, #one_read) + sub %o1, 32 - 8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20) + add %o1, 8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28) + add %o1, 8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30) + add %o1, 8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38) + subcc %o4, 64, %o4 + bne,pt %XCC, 1b + add %o0, 64, %o0 + +#undef SWIVEL_ONE_DWORD + + srl %g1, 3, %g1 + ba,pt %XCC, 60f + add %o1, %g1, %o1 + +10: /* Destination is 64-byte aligned, source was only 8-byte + * aligned but it has been subtracted by 8 and we perform + * one twin load ahead, then add 8 back into source when + * we finish the loop. + */ + LOAD_TWIN(%o1, %o4, %o5) +1: add %o1, 16, %o1 + LOAD_TWIN(%o1, %g2, %g3) + add %o1, 16 + 32, %o1 + LOAD(prefetch, %o1, #one_read) + sub %o1, 32, %o1 + STORE_INIT(%o5, %o0 + 0x00) ! initializes cache line + STORE_INIT(%g2, %o0 + 0x08) + LOAD_TWIN(%o1, %o4, %o5) + add %o1, 16, %o1 + STORE_INIT(%g3, %o0 + 0x10) + STORE_INIT(%o4, %o0 + 0x18) + LOAD_TWIN(%o1, %g2, %g3) + add %o1, 16, %o1 + STORE_INIT(%o5, %o0 + 0x20) + STORE_INIT(%g2, %o0 + 0x28) + LOAD_TWIN(%o1, %o4, %o5) + STORE_INIT(%g3, %o0 + 0x30) + STORE_INIT(%o4, %o0 + 0x38) + subcc %g1, 64, %g1 + bne,pt %XCC, 1b + add %o0, 64, %o0 + + ba,pt %XCC, 60f + add %o1, 0x8, %o1 + +50: /* Destination is 64-byte aligned, and source is 16-byte + * aligned. + */ +1: LOAD_TWIN(%o1, %o4, %o5) + add %o1, 16, %o1 + LOAD_TWIN(%o1, %g2, %g3) + add %o1, 16 + 32, %o1 + LOAD(prefetch, %o1, #one_read) + sub %o1, 32, %o1 + STORE_INIT(%o4, %o0 + 0x00) ! initializes cache line + STORE_INIT(%o5, %o0 + 0x08) + LOAD_TWIN(%o1, %o4, %o5) + add %o1, 16, %o1 + STORE_INIT(%g2, %o0 + 0x10) + STORE_INIT(%g3, %o0 + 0x18) + LOAD_TWIN(%o1, %g2, %g3) + add %o1, 16, %o1 + STORE_INIT(%o4, %o0 + 0x20) + STORE_INIT(%o5, %o0 + 0x28) + STORE_INIT(%g2, %o0 + 0x30) + STORE_INIT(%g3, %o0 + 0x38) + subcc %g1, 64, %g1 + bne,pt %XCC, 1b + add %o0, 64, %o0 + /* fall through */ + +60: + /* %o2 contains any final bytes still needed to be copied + * over. If anything is left, we copy it one byte at a time. 
+ */ + wr %g0, ASI_PNF, %asi + brz,pt %o2, 85f + sub %o0, %o1, %o3 + ba,a,pt %XCC, 90f + + .align 64 +70: /* 16 < len <= 64 */ + bne,pn %XCC, 75f + sub %o0, %o1, %o3 + +72: + andn %o2, 0xf, %o4 + and %o2, 0xf, %o2 +1: subcc %o4, 0x10, %o4 + LOAD(ldx, %o1, %o5) + add %o1, 0x08, %o1 + LOAD(ldx, %o1, %g1) + sub %o1, 0x08, %o1 + STORE(stx, %o5, %o1 + %o3) + add %o1, 0x8, %o1 + STORE(stx, %g1, %o1 + %o3) + bgu,pt %XCC, 1b + add %o1, 0x8, %o1 +73: andcc %o2, 0x8, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x8, %o2 + LOAD(ldx, %o1, %o5) + STORE(stx, %o5, %o1 + %o3) + add %o1, 0x8, %o1 +1: andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x4, %o2 + LOAD(lduw, %o1, %o5) + STORE(stw, %o5, %o1 + %o3) + add %o1, 0x4, %o1 +1: cmp %o2, 0 + be,pt %XCC, 85f + nop + ba,pt %XCC, 90f + nop + +75: + andcc %o0, 0x7, %g1 + sub %g1, 0x8, %g1 + be,pn %icc, 2f + sub %g0, %g1, %g1 + sub %o2, %g1, %o2 + +1: subcc %g1, 1, %g1 + LOAD(ldub, %o1, %o5) + STORE(stb, %o5, %o1 + %o3) + bgu,pt %icc, 1b + add %o1, 1, %o1 + +2: add %o1, %o3, %o0 + andcc %o1, 0x7, %g1 + bne,pt %icc, 8f + sll %g1, 3, %g1 + + cmp %o2, 16 + bgeu,pt %icc, 72b + nop + ba,a,pt %XCC, 73b + +8: mov 64, %o3 + andn %o1, 0x7, %o1 + LOAD(ldx, %o1, %g2) + sub %o3, %g1, %o3 + andn %o2, 0x7, %o4 + sllx %g2, %g1, %g2 +1: add %o1, 0x8, %o1 + LOAD(ldx, %o1, %g3) + subcc %o4, 0x8, %o4 + srlx %g3, %o3, %o5 + or %o5, %g2, %o5 + STORE(stx, %o5, %o0) + add %o0, 0x8, %o0 + bgu,pt %icc, 1b + sllx %g3, %g1, %g2 + + srl %g1, 3, %g1 + andcc %o2, 0x7, %o2 + be,pn %icc, 85f + add %o1, %g1, %o1 + ba,pt %XCC, 90f + sub %o0, %o1, %o3 + + .align 64 +80: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 + bne,pn %XCC, 90f + sub %o0, %o1, %o3 + +1: + subcc %o2, 4, %o2 + LOAD(lduw, %o1, %g1) + STORE(stw, %g1, %o1 + %o3) + bgu,pt %XCC, 1b + add %o1, 4, %o1 + +85: retl + mov %g5, %o0 + + .align 32 +90: + subcc %o2, 1, %o2 + LOAD(ldub, %o1, %g1) + STORE(stb, %g1, %o1 + %o3) + bgu,pt %XCC, 90b + add %o1, 1, %o1 + retl + mov %g5, %o0 + +END(__memcpy_niagara1) + +#endif diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S new file mode 100644 index 0000000000..798b3c80fe --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S @@ -0,0 +1,498 @@ +/* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara-2. + Copyright (C) 2007-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define ASI_BLK_P 0xf0 +#define ASI_P 0x80 +#define ASI_PNF 0x82 + +#define FPRS_FEF 0x04 + +#define VISEntryHalf \ + rd %fprs, %o5; \ + wr %g0, FPRS_FEF, %fprs + +#define VISExitHalf \ + and %o5, FPRS_FEF, %o5; \ + wr %o5, 0x0, %fprs + +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P + +#define LOAD(type,addr,dest) type [addr], dest +#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest +#define STORE(type,src,addr) type src, [addr] +#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P +#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI + +#ifndef XCC +#define USE_BPR +#define XCC xcc +#endif + +#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \ + faligndata %x0, %x1, %f0; \ + faligndata %x1, %x2, %f2; \ + faligndata %x2, %x3, %f4; \ + faligndata %x3, %x4, %f6; \ + faligndata %x4, %x5, %f8; \ + faligndata %x5, %x6, %f10; \ + faligndata %x6, %x7, %f12; \ + faligndata %x7, %x8, %f14; + +#define FREG_MOVE_1(x0) \ + fsrc2 %x0, %f0; +#define FREG_MOVE_2(x0, x1) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; +#define FREG_MOVE_3(x0, x1, x2) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; +#define FREG_MOVE_4(x0, x1, x2, x3) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; +#define FREG_MOVE_5(x0, x1, x2, x3, x4) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; \ + fsrc2 %x4, %f8; +#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; \ + fsrc2 %x4, %f8; \ + fsrc2 %x5, %f10; +#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; \ + fsrc2 %x4, %f8; \ + fsrc2 %x5, %f10; \ + fsrc2 %x6, %f12; +#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; \ + fsrc2 %x4, %f8; \ + fsrc2 %x5, %f10; \ + fsrc2 %x6, %f12; \ + fsrc2 %x7, %f14; +#define FREG_LOAD_1(base, x0) \ + LOAD(ldd, base + 0x00, %x0) +#define FREG_LOAD_2(base, x0, x1) \ + LOAD(ldd, base + 0x00, %x0); \ + LOAD(ldd, base + 0x08, %x1); +#define FREG_LOAD_3(base, x0, x1, x2) \ + LOAD(ldd, base + 0x00, %x0); \ + LOAD(ldd, base + 0x08, %x1); \ + LOAD(ldd, base + 0x10, %x2); +#define FREG_LOAD_4(base, x0, x1, x2, x3) \ + LOAD(ldd, base + 0x00, %x0); \ + LOAD(ldd, base + 0x08, %x1); \ + LOAD(ldd, base + 0x10, %x2); \ + LOAD(ldd, base + 0x18, %x3); +#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \ + LOAD(ldd, base + 0x00, %x0); \ + LOAD(ldd, base + 0x08, %x1); \ + LOAD(ldd, base + 0x10, %x2); \ + LOAD(ldd, base + 0x18, %x3); \ + LOAD(ldd, base + 0x20, %x4); +#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \ + LOAD(ldd, base + 0x00, %x0); \ + LOAD(ldd, base + 0x08, %x1); \ + LOAD(ldd, base + 0x10, %x2); \ + LOAD(ldd, base + 0x18, %x3); \ + LOAD(ldd, base + 0x20, %x4); \ + LOAD(ldd, base + 0x28, %x5); +#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \ + LOAD(ldd, base + 0x00, %x0); \ + LOAD(ldd, base + 0x08, %x1); \ + LOAD(ldd, base + 0x10, %x2); \ + LOAD(ldd, base + 0x18, %x3); \ + LOAD(ldd, base + 0x20, %x4); \ + LOAD(ldd, base + 0x28, %x5); \ + LOAD(ldd, base + 0x30, %x6); + +#if IS_IN (libc) + + .register %g2,#scratch + .register %g3,#scratch + .register %g6,#scratch + + .text + +ENTRY(__mempcpy_niagara2) + ba,pt %XCC, 101f + add %o0, %o2, %g5 +END(__mempcpy_niagara2) + + .align 32 +ENTRY(__memcpy_niagara2) +100: /* %o0=dst, %o1=src, %o2=len */ + mov %o0, %g5 +101: +# ifndef USE_BPR + 
srl %o2, 0, %o2 +# endif + cmp %o2, 0 + be,pn %XCC, 85f +218: or %o0, %o1, %o3 + cmp %o2, 16 + blu,a,pn %XCC, 80f + or %o3, %o2, %o3 + + /* 2 blocks (128 bytes) is the minimum we can do the block + * copy with. We need to ensure that we'll iterate at least + * once in the block copy loop. At worst we'll need to align + * the destination to a 64-byte boundary which can chew up + * to (64 - 1) bytes from the length before we perform the + * block copy loop. + * + * However, the cut-off point, performance wise, is around + * 4 64-byte blocks. + */ + cmp %o2, (4 * 64) + blu,pt %XCC, 75f + andcc %o3, 0x7, %g0 + + /* %o0: dst + * %o1: src + * %o2: len (known to be >= 128) + * + * The block copy loops can use %o4, %g2, %g3 as + * temporaries while copying the data. %o5 must + * be preserved between VISEntryHalf and VISExitHalf + */ + + LOAD(prefetch, %o1 + 0x000, #one_read) + LOAD(prefetch, %o1 + 0x040, #one_read) + LOAD(prefetch, %o1 + 0x080, #one_read) + + /* Align destination on 64-byte boundary. */ + andcc %o0, (64 - 1), %o4 + be,pt %XCC, 2f + sub %o4, 64, %o4 + sub %g0, %o4, %o4 ! bytes to align dst + sub %o2, %o4, %o2 +1: subcc %o4, 1, %o4 + LOAD(ldub, %o1, %g1) + STORE(stb, %g1, %o0) + add %o1, 1, %o1 + bne,pt %XCC, 1b + add %o0, 1, %o0 + +2: + /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve + * o5 from here until we hit VISExitHalf. + */ + VISEntryHalf + + membar #Sync + alignaddr %o1, %g0, %g0 + + add %o1, (64 - 1), %o4 + andn %o4, (64 - 1), %o4 + andn %o2, (64 - 1), %g1 + sub %o2, %g1, %o2 + + and %o1, (64 - 1), %g2 + add %o1, %g1, %o1 + sub %o0, %o4, %g3 + brz,pt %g2, 190f + cmp %g2, 32 + blu,a 5f + cmp %g2, 16 + cmp %g2, 48 + blu,a 4f + cmp %g2, 40 + cmp %g2, 56 + blu 170f + nop + ba,a,pt %xcc, 180f + +4: /* 32 <= low bits < 48 */ + blu 150f + nop + ba,a,pt %xcc, 160f +5: /* 0 < low bits < 32 */ + blu,a 6f + cmp %g2, 8 + cmp %g2, 24 + blu 130f + nop + ba,a,pt %xcc, 140f +6: /* 0 < low bits < 16 */ + bgeu 120f + nop + /* fall through for 0 < low bits < 8 */ +110: sub %o4, 64, %g2 + LOAD_BLK(%g2, %f0) +1: STORE_INIT(%g0, %o4 + %g3) + LOAD_BLK(%o4, %f16) + FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16) + STORE_BLK(%f0, %o4 + %g3) + FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +120: sub %o4, 56, %g2 + FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12) +1: STORE_INIT(%g0, %o4 + %g3) + LOAD_BLK(%o4, %f16) + FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18) + STORE_BLK(%f0, %o4 + %g3) + FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +130: sub %o4, 48, %g2 + FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10) +1: STORE_INIT(%g0, %o4 + %g3) + LOAD_BLK(%o4, %f16) + FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20) + STORE_BLK(%f0, %o4 + %g3) + FREG_MOVE_6(f20, f22, f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +140: sub %o4, 40, %g2 + FREG_LOAD_5(%g2, f0, f2, f4, f6, f8) +1: STORE_INIT(%g0, %o4 + %g3) + LOAD_BLK(%o4, %f16) + FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22) + STORE_BLK(%f0, %o4 + %g3) + FREG_MOVE_5(f22, f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +150: sub %o4, 32, %g2 + FREG_LOAD_4(%g2, f0, f2, f4, f6) +1: STORE_INIT(%g0, %o4 + %g3) + 
LOAD_BLK(%o4, %f16) + FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24) + STORE_BLK(%f0, %o4 + %g3) + FREG_MOVE_4(f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +160: sub %o4, 24, %g2 + FREG_LOAD_3(%g2, f0, f2, f4) +1: STORE_INIT(%g0, %o4 + %g3) + LOAD_BLK(%o4, %f16) + FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26) + STORE_BLK(%f0, %o4 + %g3) + FREG_MOVE_3(f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +170: sub %o4, 16, %g2 + FREG_LOAD_2(%g2, f0, f2) +1: STORE_INIT(%g0, %o4 + %g3) + LOAD_BLK(%o4, %f16) + FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28) + STORE_BLK(%f0, %o4 + %g3) + FREG_MOVE_2(f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +180: sub %o4, 8, %g2 + FREG_LOAD_1(%g2, f0) +1: STORE_INIT(%g0, %o4 + %g3) + LOAD_BLK(%o4, %f16) + FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30) + STORE_BLK(%f0, %o4 + %g3) + FREG_MOVE_1(f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +190: +1: STORE_INIT(%g0, %o4 + %g3) + subcc %g1, 64, %g1 + LOAD_BLK(%o4, %f0) + STORE_BLK(%f0, %o4 + %g3) + add %o4, 64, %o4 + bne,pt %XCC, 1b + LOAD(prefetch, %o4 + 64, #one_read) + +195: + add %o4, %g3, %o0 + membar #Sync + + VISExitHalf + + /* %o2 contains any final bytes still needed to be copied + * over. If anything is left, we copy it one byte at a time. + */ + brz,pt %o2, 85f + sub %o0, %o1, %o3 + ba,a,pt %XCC, 90f + + .align 64 +75: /* 16 < len <= 64 */ + bne,pn %XCC, 75f + sub %o0, %o1, %o3 + +72: + andn %o2, 0xf, %o4 + and %o2, 0xf, %o2 +1: subcc %o4, 0x10, %o4 + LOAD(ldx, %o1, %o5) + add %o1, 0x08, %o1 + LOAD(ldx, %o1, %g1) + sub %o1, 0x08, %o1 + STORE(stx, %o5, %o1 + %o3) + add %o1, 0x8, %o1 + STORE(stx, %g1, %o1 + %o3) + bgu,pt %XCC, 1b + add %o1, 0x8, %o1 +73: andcc %o2, 0x8, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x8, %o2 + LOAD(ldx, %o1, %o5) + STORE(stx, %o5, %o1 + %o3) + add %o1, 0x8, %o1 +1: andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x4, %o2 + LOAD(lduw, %o1, %o5) + STORE(stw, %o5, %o1 + %o3) + add %o1, 0x4, %o1 +1: cmp %o2, 0 + be,pt %XCC, 85f + nop + ba,pt %xcc, 90f + nop + +75: + andcc %o0, 0x7, %g1 + sub %g1, 0x8, %g1 + be,pn %icc, 2f + sub %g0, %g1, %g1 + sub %o2, %g1, %o2 + +1: subcc %g1, 1, %g1 + LOAD(ldub, %o1, %o5) + STORE(stb, %o5, %o1 + %o3) + bgu,pt %icc, 1b + add %o1, 1, %o1 + +2: add %o1, %o3, %o0 + andcc %o1, 0x7, %g1 + bne,pt %icc, 8f + sll %g1, 3, %g1 + + cmp %o2, 16 + bgeu,pt %icc, 72b + nop + ba,a,pt %xcc, 73b + +8: mov 64, %o3 + andn %o1, 0x7, %o1 + LOAD(ldx, %o1, %g2) + sub %o3, %g1, %o3 + andn %o2, 0x7, %o4 + sllx %g2, %g1, %g2 +1: add %o1, 0x8, %o1 + LOAD(ldx, %o1, %g3) + subcc %o4, 0x8, %o4 + srlx %g3, %o3, %o5 + or %o5, %g2, %o5 + STORE(stx, %o5, %o0) + add %o0, 0x8, %o0 + bgu,pt %icc, 1b + sllx %g3, %g1, %g2 + + srl %g1, 3, %g1 + andcc %o2, 0x7, %o2 + be,pn %icc, 85f + add %o1, %g1, %o1 + ba,pt %xcc, 90f + sub %o0, %o1, %o3 + + .align 64 +80: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 + bne,pn %XCC, 90f + sub %o0, %o1, %o3 + +1: + subcc %o2, 4, %o2 + LOAD(lduw, %o1, %g1) + STORE(stw, %g1, %o1 + %o3) + bgu,pt %XCC, 1b + add %o1, 4, %o1 + +85: retl + mov %g5, %o0 + + .align 32 +90: + subcc %o2, 1, %o2 + LOAD(ldub, %o1, %g1) + STORE(stb, %g1, %o1 + %o3) + bgu,pt %XCC, 90b + add %o1, 1, %o1 + retl + mov 
%g5, %o0 + +END(__memcpy_niagara2) + +#endif diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara4.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara4.S new file mode 100644 index 0000000000..709b398364 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara4.S @@ -0,0 +1,332 @@ +/* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara-4. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 + +#define FPRS_FEF 0x04 + +/* On T4 it is very expensive to access ASRs like %fprs and + * %asi, avoiding a read or a write can save ~50 cycles. + */ +#define FPU_ENTER \ + rd %fprs, %o5; \ + andcc %o5, FPRS_FEF, %g0; \ + be,a,pn %icc, 999f; \ + wr %g0, FPRS_FEF, %fprs; \ + 999: + +#define VISEntryHalf FPU_ENTER +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs + +#define GLOBAL_SPARE %g5 + +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P +#define EX_LD(x) x +#define EX_ST(x) x +#define EX_RETVAL(x) x +#define LOAD(type,addr,dest) type [addr], dest +#define STORE(type,src,addr) type src, [addr] +#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI + +#if IS_IN (libc) + + .register %g2,#scratch + .register %g3,#scratch + .register %g6,#scratch + + .text + +ENTRY(__mempcpy_niagara4) + ba,pt %icc, 101f + add %o0, %o2, %o3 +END(__mempcpy_niagara4) + + .align 32 +ENTRY(__memcpy_niagara4) +100: /* %o0=dst, %o1=src, %o2=len */ + mov %o0, %o3 +101: +#ifndef __arch64__ + srl %o2, 0, %o2 +#endif + brz,pn %o2, .Lexit + cmp %o2, 3 + ble,pn %icc, .Ltiny + cmp %o2, 19 + ble,pn %icc, .Lsmall + or %o0, %o1, %g2 + cmp %o2, 128 + bl,pn %icc, .Lmedium + nop + +.Llarge:/* len >= 0x80 */ + /* First get dest 8 byte aligned. */ + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, 51f + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) + add %o1, 1, %o1 + subcc %g1, 1, %g1 + add %o0, 1, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g2, %o0 - 0x01)) + +51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) + LOAD(prefetch, %o1 + 0x080, #n_reads_strong) + LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) + LOAD(prefetch, %o1 + 0x100, #n_reads_strong) + LOAD(prefetch, %o1 + 0x140, #n_reads_strong) + LOAD(prefetch, %o1 + 0x180, #n_reads_strong) + LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) + LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + + /* Check if we can use the straight fully aligned + * loop, or we require the alignaddr/faligndata variant. + */ + andcc %o1, 0x7, %o5 + bne,pn %icc, .Llarge_src_unaligned + sub %g0, %o0, %g1 + + /* Legitimize the use of initializing stores by getting dest + * to be 64-byte aligned. 
+ */ + and %g1, 0x3f, %g1 + brz,pt %g1, .Llarge_aligned + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2)) + add %o1, 8, %o1 + subcc %g1, 8, %g1 + add %o0, 8, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stx, %g2, %o0 - 0x08)) + +.Llarge_aligned: + /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ + andn %o2, 0x3f, %o4 + sub %o2, %o4, %o2 + +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + add %o1, 0x40, %o1 + EX_LD(LOAD(ldx, %o1 - 0x38, %g2)) + subcc %o4, 0x40, %o4 + EX_LD(LOAD(ldx, %o1 - 0x30, %g3)) + EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE)) + EX_LD(LOAD(ldx, %o1 - 0x20, %o5)) + EX_ST(STORE_INIT(%g1, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g2, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x18, %g2)) + EX_ST(STORE_INIT(%g3, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x10, %g3)) + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE)) + EX_ST(STORE_INIT(%o5, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g2, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g3, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) + add %o0, 0x08, %o0 + bne,pt %icc, 1b + LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + + membar #StoreLoad | #StoreStore + + brz,pn %o2, .Lexit + cmp %o2, 19 + ble,pn %icc, .Lsmall_unaligned + nop + ba,a,pt %icc, .Lmedium_noprefetch + +.Lexit: retl + mov EX_RETVAL(%o3), %o0 + +.Llarge_src_unaligned: + andn %o2, 0x3f, %o4 + sub %o2, %o4, %o2 + VISEntryHalf + alignaddr %o1, %g0, %g1 + add %o1, %o4, %o1 + EX_LD(LOAD(ldd, %g1 + 0x00, %f0)) +1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2)) + subcc %o4, 0x40, %o4 + EX_LD(LOAD(ldd, %g1 + 0x10, %f4)) + EX_LD(LOAD(ldd, %g1 + 0x18, %f6)) + EX_LD(LOAD(ldd, %g1 + 0x20, %f8)) + EX_LD(LOAD(ldd, %g1 + 0x28, %f10)) + EX_LD(LOAD(ldd, %g1 + 0x30, %f12)) + EX_LD(LOAD(ldd, %g1 + 0x38, %f14)) + faligndata %f0, %f2, %f16 + EX_LD(LOAD(ldd, %g1 + 0x40, %f0)) + faligndata %f2, %f4, %f18 + add %g1, 0x40, %g1 + faligndata %f4, %f6, %f20 + faligndata %f6, %f8, %f22 + faligndata %f8, %f10, %f24 + faligndata %f10, %f12, %f26 + faligndata %f12, %f14, %f28 + faligndata %f14, %f0, %f30 + EX_ST(STORE(std, %f16, %o0 + 0x00)) + EX_ST(STORE(std, %f18, %o0 + 0x08)) + EX_ST(STORE(std, %f20, %o0 + 0x10)) + EX_ST(STORE(std, %f22, %o0 + 0x18)) + EX_ST(STORE(std, %f24, %o0 + 0x20)) + EX_ST(STORE(std, %f26, %o0 + 0x28)) + EX_ST(STORE(std, %f28, %o0 + 0x30)) + EX_ST(STORE(std, %f30, %o0 + 0x38)) + add %o0, 0x40, %o0 + bne,pt %icc, 1b + LOAD(prefetch, %g1 + 0x200, #n_reads_strong) + VISExitHalf + + brz,pn %o2, .Lexit + cmp %o2, 19 + ble,pn %icc, .Lsmall_unaligned + nop + ba,a,pt %icc, .Lmedium_unaligned + +.Lmedium: + LOAD(prefetch, %o1 + 0x40, #n_reads_strong) + andcc %g2, 0x7, %g0 + bne,pn %icc, .Lmedium_unaligned + nop +.Lmedium_noprefetch: + andncc %o2, 0x20 - 1, %o5 + be,pn %icc, 2f + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) + EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE)) + EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) + add %o1, 0x20, %o1 + subcc %o5, 0x20, %o5 + EX_ST(STORE(stx, %g1, %o0 + 0x00)) + EX_ST(STORE(stx, %g2, %o0 + 0x08)) + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10)) + EX_ST(STORE(stx, %o4, %o0 + 0x18)) + bne,pt %icc, 1b + add %o0, 0x20, %o0 +2: andcc %o2, 0x18, %o5 + be,pt %icc, 3f + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + add %o1, 0x08, %o1 + add %o0, 0x08, %o0 + subcc %o5, 0x08, %o5 + bne,pt %icc, 1b + EX_ST(STORE(stx, %g1, %o0 - 0x08)) +3: brz,pt %o2, .Lexit + cmp %o2, 0x04 + bl,pn %icc, .Ltiny + nop + EX_LD(LOAD(lduw, %o1 + 
0x00, %g1)) + add %o1, 0x04, %o1 + add %o0, 0x04, %o0 + subcc %o2, 0x04, %o2 + bne,pn %icc, .Ltiny + EX_ST(STORE(stw, %g1, %o0 - 0x04)) + ba,a,pt %icc, .Lexit +.Lmedium_unaligned: + /* First get dest 8 byte aligned. */ + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, 2f + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) + add %o1, 1, %o1 + subcc %g1, 1, %g1 + add %o0, 1, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g2, %o0 - 0x01)) +2: + and %o1, 0x7, %g1 + brz,pn %g1, .Lmedium_noprefetch + sll %g1, 3, %g1 + mov 64, %g2 + sub %g2, %g1, %g2 + andn %o1, 0x7, %o1 + EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) + sllx %o4, %g1, %o4 + andn %o2, 0x08 - 1, %o5 + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) + add %o1, 0x08, %o1 + subcc %o5, 0x08, %o5 + srlx %g3, %g2, GLOBAL_SPARE + or GLOBAL_SPARE, %o4, GLOBAL_SPARE + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00)) + add %o0, 0x08, %o0 + bne,pt %icc, 1b + sllx %g3, %g1, %o4 + srl %g1, 3, %g1 + add %o1, %g1, %o1 + brz,pn %o2, .Lexit + nop + ba,pt %icc, .Lsmall_unaligned + +.Ltiny: + EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) + subcc %o2, 1, %o2 + be,pn %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x00)) + EX_LD(LOAD(ldub, %o1 + 0x01, %g1)) + subcc %o2, 1, %o2 + be,pn %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x01)) + EX_LD(LOAD(ldub, %o1 + 0x02, %g1)) + ba,pt %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x02)) + +.Lsmall: + andcc %g2, 0x3, %g0 + bne,pn %icc, .Lsmall_unaligned + andn %o2, 0x4 - 1, %o5 + sub %o2, %o5, %o2 +1: + EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) + add %o1, 0x04, %o1 + subcc %o5, 0x04, %o5 + add %o0, 0x04, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stw, %g1, %o0 - 0x04)) + brz,pt %o2, .Lexit + nop + ba,a,pt %icc, .Ltiny + +.Lsmall_unaligned: +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) + add %o1, 1, %o1 + add %o0, 1, %o0 + subcc %o2, 1, %o2 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g1, %o0 - 0x01)) + ba,a,pt %icc, .Lexit +END(__memcpy_niagara4) + +#endif diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S new file mode 100644 index 0000000000..b8f5c3cb8f --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S @@ -0,0 +1,325 @@ +/* Copy SIZE bytes from SRC to DEST. + For UltraSPARC-III. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@redhat.com) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#define ASI_BLK_P 0xf0 +#define FPRS_FEF 0x04 +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs + +#ifndef XCC +#define USE_BPR +#define XCC xcc +#endif + +#if IS_IN (libc) + + .register %g2,#scratch + .register %g3,#scratch + .register %g6,#scratch + + .text + +ENTRY(__mempcpy_ultra3) + ba,pt %XCC, 101f + add %o0, %o2, %g5 +END(__mempcpy_ultra3) + + /* Special/non-trivial issues of this code: + * + * 1) %o5 is preserved from VISEntryHalf to VISExitHalf + * 2) Only low 32 FPU registers are used so that only the + * lower half of the FPU register set is dirtied by this + * code. This is especially important in the kernel. + * 3) This code never prefetches cachelines past the end + * of the source buffer. + * + * The cheetah's flexible spine, oversized liver, enlarged heart, + * slender muscular body, and claws make it the swiftest hunter + * in Africa and the fastest animal on land. Can reach speeds + * of up to 2.4GB per second. + */ + .align 32 +ENTRY(__memcpy_ultra3) + +100: /* %o0=dst, %o1=src, %o2=len */ + mov %o0, %g5 +101: + cmp %o2, 0 + be,pn %XCC, out +218: or %o0, %o1, %o3 + cmp %o2, 16 + bleu,a,pn %XCC, small_copy + or %o3, %o2, %o3 + + cmp %o2, 256 + blu,pt %XCC, medium_copy + andcc %o3, 0x7, %g0 + + ba,pt %xcc, enter + andcc %o0, 0x3f, %g2 + + /* Here len >= 256 and condition codes reflect execution + * of "andcc %o0, 0x7, %g2", done by caller. + */ + .align 64 +enter: + /* Is 'dst' already aligned on an 64-byte boundary? */ + be,pt %XCC, 2f + + /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number + * of bytes to copy to make 'dst' 64-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x40, %g2 + sub %g0, %g2, %g2 + sub %o2, %g2, %o2 + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: ldub [%o1 + 0x00], %o3 + add %o1, 0x1, %o1 + add %o0, 0x1, %o0 + subcc %g2, 0x1, %g2 + + bg,pt %XCC, 1b + stb %o3, [%o0 + -1] + +2: VISEntryHalf + and %o1, 0x7, %g1 + ba,pt %xcc, begin + alignaddr %o1, %g0, %o1 + + .align 64 +begin: + prefetch [%o1 + 0x000], #one_read + prefetch [%o1 + 0x040], #one_read + andn %o2, (0x40 - 1), %o4 + prefetch [%o1 + 0x080], #one_read + prefetch [%o1 + 0x0c0], #one_read + ldd [%o1 + 0x000], %f0 + prefetch [%o1 + 0x100], #one_read + ldd [%o1 + 0x008], %f2 + prefetch [%o1 + 0x140], #one_read + ldd [%o1 + 0x010], %f4 + prefetch [%o1 + 0x180], #one_read + faligndata %f0, %f2, %f16 + ldd [%o1 + 0x018], %f6 + faligndata %f2, %f4, %f18 + ldd [%o1 + 0x020], %f8 + faligndata %f4, %f6, %f20 + ldd [%o1 + 0x028], %f10 + faligndata %f6, %f8, %f22 + + ldd [%o1 + 0x030], %f12 + faligndata %f8, %f10, %f24 + ldd [%o1 + 0x038], %f14 + faligndata %f10, %f12, %f26 + ldd [%o1 + 0x040], %f0 + + sub %o4, 0x80, %o4 + add %o1, 0x40, %o1 + ba,pt %xcc, loop + srl %o4, 6, %o3 + + .align 64 +loop: + ldd [%o1 + 0x008], %f2 + faligndata %f12, %f14, %f28 + ldd [%o1 + 0x010], %f4 + faligndata %f14, %f0, %f30 + stda %f16, [%o0] ASI_BLK_P + ldd [%o1 + 0x018], %f6 + faligndata %f0, %f2, %f16 + + ldd [%o1 + 0x020], %f8 + faligndata %f2, %f4, %f18 + ldd [%o1 + 0x028], %f10 + faligndata %f4, %f6, %f20 + ldd [%o1 + 0x030], %f12 + faligndata %f6, %f8, %f22 + ldd [%o1 + 0x038], %f14 + faligndata %f8, %f10, %f24 + + ldd [%o1 + 0x040], %f0 + prefetch [%o1 + 0x180], #one_read + faligndata %f10, %f12, %f26 + subcc %o3, 0x01, %o3 + add %o1, 0x40, %o1 + bg,pt %XCC, loop + add %o0, 0x40, %o0 + + /* Finally we copy the last full 64-byte block. 
*/ +loopfini: + ldd [%o1 + 0x008], %f2 + faligndata %f12, %f14, %f28 + ldd [%o1 + 0x010], %f4 + faligndata %f14, %f0, %f30 + stda %f16, [%o0] ASI_BLK_P + ldd [%o1 + 0x018], %f6 + faligndata %f0, %f2, %f16 + ldd [%o1 + 0x020], %f8 + faligndata %f2, %f4, %f18 + ldd [%o1 + 0x028], %f10 + faligndata %f4, %f6, %f20 + ldd [%o1 + 0x030], %f12 + faligndata %f6, %f8, %f22 + ldd [%o1 + 0x038], %f14 + faligndata %f8, %f10, %f24 + cmp %g1, 0 + be,pt %XCC, 1f + add %o0, 0x40, %o0 + ldd [%o1 + 0x040], %f0 +1: faligndata %f10, %f12, %f26 + faligndata %f12, %f14, %f28 + faligndata %f14, %f0, %f30 + stda %f16, [%o0] ASI_BLK_P + add %o0, 0x40, %o0 + add %o1, 0x40, %o1 + membar #Sync + + /* Now we copy the (len modulo 64) bytes at the end. + * Note how we borrow the %f0 loaded above. + * + * Also notice how this code is careful not to perform a + * load past the end of the src buffer. + */ +loopend: + and %o2, 0x3f, %o2 + andcc %o2, 0x38, %g2 + be,pn %XCC, endcruft + subcc %g2, 0x8, %g2 + be,pn %XCC, endcruft + cmp %g1, 0 + + be,a,pt %XCC, 1f + ldd [%o1 + 0x00], %f0 + +1: ldd [%o1 + 0x08], %f2 + add %o1, 0x8, %o1 + sub %o2, 0x8, %o2 + subcc %g2, 0x8, %g2 + faligndata %f0, %f2, %f8 + std %f8, [%o0 + 0x00] + be,pn %XCC, endcruft + add %o0, 0x8, %o0 + ldd [%o1 + 0x08], %f0 + add %o1, 0x8, %o1 + sub %o2, 0x8, %o2 + subcc %g2, 0x8, %g2 + faligndata %f2, %f0, %f8 + std %f8, [%o0 + 0x00] + bne,pn %XCC, 1b + add %o0, 0x8, %o0 + + /* If anything is left, we copy it one byte at a time. + * Note that %g1 is (src & 0x3) saved above before the + * alignaddr was performed. + */ +endcruft: + cmp %o2, 0 + add %o1, %g1, %o1 + VISExitHalf + be,pn %XCC, out + sub %o0, %o1, %o3 + + andcc %g1, 0x7, %g0 + bne,pn %icc, small_copy_unaligned + andcc %o2, 0x8, %g0 + be,pt %icc, 1f + nop + ldx [%o1], %o5 + stx %o5, [%o1 + %o3] + add %o1, 0x8, %o1 + +1: andcc %o2, 0x4, %g0 + be,pt %icc, 1f + nop + lduw [%o1], %o5 + stw %o5, [%o1 + %o3] + add %o1, 0x4, %o1 + +1: andcc %o2, 0x2, %g0 + be,pt %icc, 1f + nop + lduh [%o1], %o5 + sth %o5, [%o1 + %o3] + add %o1, 0x2, %o1 + +1: andcc %o2, 0x1, %g0 + be,pt %icc, out + nop + ldub [%o1], %o5 + ba,pt %xcc, out + stb %o5, [%o1 + %o3] + +medium_copy: /* 16 < len <= 64 */ + bne,pn %XCC, small_copy_unaligned + sub %o0, %o1, %o3 + +medium_copy_aligned: + andn %o2, 0x7, %o4 + and %o2, 0x7, %o2 +1: subcc %o4, 0x8, %o4 + ldx [%o1], %o5 + stx %o5, [%o1 + %o3] + bgu,pt %XCC, 1b + add %o1, 0x8, %o1 + andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x4, %o2 + lduw [%o1], %o5 + stw %o5, [%o1 + %o3] + add %o1, 0x4, %o1 +1: cmp %o2, 0 + be,pt %XCC, out + nop + ba,pt %xcc, small_copy_unaligned + nop + +small_copy: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 + bne,pn %XCC, small_copy_unaligned + sub %o0, %o1, %o3 + +small_copy_aligned: + subcc %o2, 4, %o2 + lduw [%o1], %g1 + stw %g1, [%o1 + %o3] + bgu,pt %XCC, small_copy_aligned + add %o1, 4, %o1 + +out: retl + mov %g5, %o0 + + .align 32 +small_copy_unaligned: + subcc %o2, 1, %o2 + ldub [%o1], %g1 + stb %g1, [%o1 + %o3] + bgu,pt %XCC, small_copy_unaligned + add %o1, 1, %o1 + retl + mov %g5, %o0 + +END(__memcpy_ultra3) + +#endif \ No newline at end of file diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy.S new file mode 100644 index 0000000000..b6396eeae5 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy.S @@ -0,0 +1,167 @@ +/* Multiple versions of memcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. 
+ Contributed by David S. Miller (davem@davemloft.net) + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) + .text +ENTRY(memcpy) + .type memcpy, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_CRYPTO, %o1 + andcc %o0, %o1, %g0 + be 1f + andcc %o0, HWCAP_SPARC_N2, %g0 +# ifdef SHARED + sethi %gdop_hix22(__memcpy_niagara4), %o1 + xor %o1, %gdop_lox10(__memcpy_niagara4), %o1 +# else + set __memcpy_niagara4, %o1 +# endif + ba 10f + nop +1: be 1f + andcc %o0, HWCAP_SPARC_BLKINIT, %g0 +# ifdef SHARED + sethi %gdop_hix22(__memcpy_niagara2), %o1 + xor %o1, %gdop_lox10(__memcpy_niagara2), %o1 +# else + set __memcpy_niagara2, %o1 +# endif + ba 10f + nop +1: be 1f + andcc %o0, HWCAP_SPARC_ULTRA3, %g0 +# ifdef SHARED + sethi %gdop_hix22(__memcpy_niagara1), %o1 + xor %o1, %gdop_lox10(__memcpy_niagara1), %o1 +# else + set __memcpy_niagara1, %o1 +# endif + ba 10f + nop +1: be 9f + nop +# ifdef SHARED + sethi %gdop_hix22(__memcpy_ultra3), %o1 + xor %o1, %gdop_lox10(__memcpy_ultra3), %o1 +# else + set __memcpy_ultra3, %o1 +# endif + ba 10f + nop +9: +# ifdef SHARED + sethi %gdop_hix22(__memcpy_ultra1), %o1 + xor %o1, %gdop_lox10(__memcpy_ultra1), %o1 +# else + set __memcpy_ultra1, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(memcpy) + +ENTRY(__mempcpy) + .type __mempcpy, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_CRYPTO, %o1 + andcc %o0, %o1, %g0 + be 1f + andcc %o0, HWCAP_SPARC_N2, %g0 +# ifdef SHARED + sethi %gdop_hix22(__mempcpy_niagara4), %o1 + xor %o1, %gdop_lox10(__mempcpy_niagara4), %o1 +# else + set __mempcpy_niagara4, %o1 +# endif + ba 10f + nop +1: be 1f + andcc %o0, HWCAP_SPARC_BLKINIT, %g0 +# ifdef SHARED + sethi %gdop_hix22(__mempcpy_niagara2), %o1 + xor %o1, %gdop_lox10(__mempcpy_niagara2), %o1 +# else + set __mempcpy_niagara2, %o1 +# endif + ba 10f + nop +1: be 1f + andcc %o0, HWCAP_SPARC_ULTRA3, %g0 +# ifdef SHARED + sethi %gdop_hix22(__mempcpy_niagara1), %o1 + xor %o1, %gdop_lox10(__mempcpy_niagara1), %o1 +# else + set __mempcpy_niagara1, %o1 +# endif + ba 10f + nop +1: be 9f + nop +# ifdef SHARED + sethi %gdop_hix22(__mempcpy_ultra3), %o1 + xor %o1, %gdop_lox10(__mempcpy_ultra3), %o1 +# else + set __mempcpy_ultra3, %o1 +# endif + ba 10f + nop +9: +# ifdef SHARED + sethi %gdop_hix22(__mempcpy_ultra1), %o1 + xor %o1, %gdop_lox10(__mempcpy_ultra1), %o1 +# else + set __mempcpy_ultra1, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(__mempcpy) + +libc_hidden_builtin_def (memcpy) + +libc_hidden_def (__mempcpy) +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#undef weak_alias +#define weak_alias(x, y) +#undef 
libc_hidden_def +#define libc_hidden_def(name) + +#define memcpy __memcpy_ultra1 +#define __mempcpy __mempcpy_ultra1 + +#endif + +#include "../memcpy.S" diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara1.S new file mode 100644 index 0000000000..45b2251691 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara1.S @@ -0,0 +1,177 @@ +/* Set a block of memory to some byte value. For SUN4V Niagara. + Copyright (C) 2006-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define ASI_P 0x80 +#define ASI_PNF 0x82 + +#ifndef XCC +#define USE_BPR +#define XCC xcc +#endif + +#if IS_IN (libc) + + .register %g2,#scratch + + .text + .align 32 + +ENTRY(__memset_niagara1) + /* %o0=buf, %o1=pat, %o2=len */ + and %o1, 0xff, %o3 + mov %o2, %o1 + sllx %o3, 8, %g1 + or %g1, %o3, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %XCC, 1f + or %g1, %o2, %o2 +END(__memset_niagara1) + +ENTRY(__bzero_niagara1) + clr %o2 +1: +# ifndef USE_BRP + srl %o1, 0, %o1 +# endif + brz,pn %o1, 90f + mov %o0, %o3 + + wr %g0, ASI_P, %asi + + cmp %o1, 15 + blu,pn %XCC, 70f + andcc %o0, 0x7, %g1 + be,pt %XCC, 2f + mov 8, %g2 + sub %g2, %g1, %g1 + sub %o1, %g1, %o1 +1: stba %o2, [%o0 + 0x00] %asi + subcc %g1, 1, %g1 + bne,pt %XCC, 1b + add %o0, 1, %o0 +2: cmp %o1, 128 + blu,pn %XCC, 60f + andcc %o0, (64 - 1), %g1 + be,pt %XCC, 40f + mov 64, %g2 + sub %g2, %g1, %g1 + sub %o1, %g1, %o1 +1: stxa %o2, [%o0 + 0x00] %asi + subcc %g1, 8, %g1 + bne,pt %XCC, 1b + add %o0, 8, %o0 + +40: + wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi + andn %o1, (64 - 1), %g1 + sub %o1, %g1, %o1 + + andn %g1, (256 - 1), %g2 + brz,pt %g2, 50f + and %g1, (256 - 1), %g1 + +45: + stxa %o2, [%o0 + 0x00] %asi + stxa %o2, [%o0 + 0x08] %asi + stxa %o2, [%o0 + 0x10] %asi + stxa %o2, [%o0 + 0x18] %asi + stxa %o2, [%o0 + 0x20] %asi + stxa %o2, [%o0 + 0x28] %asi + stxa %o2, [%o0 + 0x30] %asi + stxa %o2, [%o0 + 0x38] %asi + stxa %o2, [%o0 + 0x40] %asi + stxa %o2, [%o0 + 0x48] %asi + stxa %o2, [%o0 + 0x50] %asi + stxa %o2, [%o0 + 0x58] %asi + stxa %o2, [%o0 + 0x60] %asi + stxa %o2, [%o0 + 0x68] %asi + stxa %o2, [%o0 + 0x70] %asi + stxa %o2, [%o0 + 0x78] %asi + stxa %o2, [%o0 + 0x80] %asi + stxa %o2, [%o0 + 0x88] %asi + stxa %o2, [%o0 + 0x90] %asi + stxa %o2, [%o0 + 0x98] %asi + stxa %o2, [%o0 + 0xa0] %asi + stxa %o2, [%o0 + 0xa8] %asi + stxa %o2, [%o0 + 0xb0] %asi + stxa %o2, [%o0 + 0xb8] %asi + stxa %o2, [%o0 + 0xc0] %asi + stxa %o2, [%o0 + 0xc8] %asi + stxa %o2, [%o0 + 0xd0] %asi + stxa %o2, [%o0 + 0xd8] %asi + stxa %o2, [%o0 + 0xe0] %asi + stxa %o2, [%o0 + 0xe8] %asi + stxa %o2, [%o0 + 0xf0] %asi + stxa %o2, [%o0 + 0xf8] %asi + subcc %g2, 256, 
%g2 + bne,pt %XCC, 45b + add %o0, 256, %o0 + + brz,pn %g1, 55f + nop + +50: + stxa %o2, [%o0 + 0x00] %asi + stxa %o2, [%o0 + 0x08] %asi + stxa %o2, [%o0 + 0x10] %asi + stxa %o2, [%o0 + 0x18] %asi + stxa %o2, [%o0 + 0x20] %asi + stxa %o2, [%o0 + 0x28] %asi + stxa %o2, [%o0 + 0x30] %asi + stxa %o2, [%o0 + 0x38] %asi + subcc %g1, 64, %g1 + bne,pt %XCC, 50b + add %o0, 64, %o0 + +55: + wr %g0, ASI_P, %asi + brz,pn %o1, 80f +60: + andncc %o1, 0x7, %g1 + be,pn %XCC, 2f + sub %o1, %g1, %o1 +1: stxa %o2, [%o0 + 0x00] %asi + subcc %g1, 8, %g1 + bne,pt %XCC, 1b + add %o0, 8, %o0 +2: brz,pt %o1, 80f + nop + +70: +1: stba %o2, [%o0 + 0x00] %asi + subcc %o1, 1, %o1 + bne,pt %XCC, 1b + add %o0, 1, %o0 + + /* fallthrough */ + +80: + wr %g0, ASI_PNF, %asi + +90: + retl + mov %o3, %o0 +END(__bzero_niagara1) + +#endif diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara4.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara4.S new file mode 100644 index 0000000000..c04a07a7f9 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara4.S @@ -0,0 +1,124 @@ +/* Set a block of memory to some byte value. For SUN4V Niagara-4. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 + +#if IS_IN (libc) + + .register %g2, #scratch + .register %g3, #scratch + + .text + .align 32 + +ENTRY(__memset_niagara4) + andcc %o1, 0xff, %o4 + be,pt %icc, 1f + mov %o2, %o1 + sllx %o4, 8, %g1 + or %g1, %o4, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %icc, 1f + or %g1, %o2, %o4 +END(__memset_niagara4) + + .align 32 +ENTRY(__bzero_niagara4) + clr %o4 +1: cmp %o1, 16 + ble %icc, .Ltiny + mov %o0, %o3 + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, .Laligned8 + sub %o1, %g1, %o1 +1: stb %o4, [%o0 + 0x00] + subcc %g1, 1, %g1 + bne,pt %icc, 1b + add %o0, 1, %o0 +.Laligned8: + cmp %o1, 64 + (64 - 8) + ble .Lmedium + sub %g0, %o0, %g1 + andcc %g1, (64 - 1), %g1 + brz,pn %g1, .Laligned64 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 8, %g1 + bne,pt %icc, 1b + add %o0, 0x8, %o0 +.Laligned64: + andn %o1, 64 - 1, %g1 + sub %o1, %g1, %o1 + brnz,pn %o4, .Lnon_bzero_loop + mov 0x20, %g2 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x40, %o0 +.Lpostloop: + cmp %o1, 8 + bl,pn %icc, .Ltiny + membar #StoreStore|#StoreLoad +.Lmedium: + andn %o1, 0x7, %g1 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 0x8, %g1 + bne,pt %icc, 1b + add %o0, 0x08, %o0 + andcc %o1, 0x4, %g1 + be,pt %icc, .Ltiny + sub %o1, %g1, %o1 + stw %o4, [%o0 + 0x00] + add %o0, 0x4, %o0 +.Ltiny: + cmp %o1, 0 + be,pn %icc, .Lexit +1: subcc %o1, 1, %o1 + stb %o4, [%o0 + 0x00] + bne,pt %icc, 1b + add %o0, 1, %o0 +.Lexit: + retl + mov %o3, %o0 +.Lnon_bzero_loop: + mov 0x08, %g3 + mov 0x28, %o5 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + add %o0, 0x10, %o0 + stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x30, %o0 + ba,a,pt %icc, .Lpostloop +END(__bzero_niagara4) + +#endif diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset.S new file mode 100644 index 0000000000..9469d5e7ce --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset.S @@ -0,0 +1,124 @@ +/* Multiple versions of memset and bzero + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by David S. Miller (davem@davemloft.net) + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#if IS_IN (libc) + .text +ENTRY(memset) + .type memset, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_CRYPTO, %o1 + andcc %o0, %o1, %g0 + be 1f + andcc %o0, HWCAP_SPARC_BLKINIT, %g0 +# ifdef SHARED + sethi %gdop_hix22(__memset_niagara4), %o1 + xor %o1, %gdop_lox10(__memset_niagara4), %o1 +# else + set __memset_niagara4, %o1 +# endif + ba 10f + nop +1: be 9f + nop +# ifdef SHARED + sethi %gdop_hix22(__memset_niagara1), %o1 + xor %o1, %gdop_lox10(__memset_niagara1), %o1 +# else + set __memset_niagara1, %o1 +# endif + ba 10f + nop +9: +# ifdef SHARED + sethi %gdop_hix22(__memset_ultra1), %o1 + xor %o1, %gdop_lox10(__memset_ultra1), %o1 +# else + set __memset_ultra1, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(memset) + +ENTRY(__bzero) + .type bzero, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_CRYPTO, %o1 + andcc %o0, %o1, %g0 + be 1f + andcc %o0, HWCAP_SPARC_BLKINIT, %g0 +# ifdef SHARED + sethi %gdop_hix22(__bzero_niagara4), %o1 + xor %o1, %gdop_lox10(__bzero_niagara4), %o1 +# else + set __bzero_niagara4, %o1 +# endif + ba 10f + nop +1: be 9f + nop +# ifdef SHARED + sethi %gdop_hix22(__bzero_niagara1), %o1 + xor %o1, %gdop_lox10(__bzero_niagara1), %o1 +# else + set __bzero_niagara1, %o1 +# endif + ba 10f + nop +9: +# ifdef SHARED + sethi %gdop_hix22(__bzero_ultra1), %o1 + xor %o1, %gdop_lox10(__bzero_ultra1), %o1 +# else + set __bzero_ultra1, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(__bzero) + +weak_alias (__bzero, bzero) + +# undef weak_alias +# define weak_alias(a, b) + +libc_hidden_builtin_def (memset) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#define memset __memset_ultra1 +#define __bzero __bzero_ultra1 + +#endif + +#include "../memset.S" diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1-vis3.S new file mode 100644 index 0000000000..7c4fa49ce4 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1-vis3.S @@ -0,0 +1,73 @@ +! SPARC v9 64-bit VIS3 __mpn_mul_1 -- Multiply a limb vector with a single +! limb and store the product in a second limb vector. +! +! Copyright (C) 2013-2017 Free Software Foundation, Inc. +! This file is part of the GNU C Library. +! Contributed by David S. Miller <davem@davemloft.net> +! +! The GNU C Library is free software; you can redistribute it and/or +! modify it under the terms of the GNU Lesser General Public +! License as published by the Free Software Foundation; either +! version 2.1 of the License, or (at your option) any later version. +! +! The GNU C Library is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public +! License along with the GNU C Library; if not, see +! <http://www.gnu.org/licenses/>. 
+ +#include <sysdep.h> + +#define res_ptr %o0 +#define s1_ptr %o1 +#define sz %o2 +#define s2_limb %o3 +#define carry %o5 +#define tmp1 %g1 +#define tmp2 %g2 +#define tmp3 %g3 +#define tmp4 %o4 + + .register %g2,#scratch + .register %g3,#scratch +ENTRY(__mpn_mul_1_vis3) + subcc sz, 1, sz + be .Lfinal_limb + clr carry + +.Lloop: + ldx [s1_ptr + 0x00], tmp1 + ldx [s1_ptr + 0x08], tmp4 + mulx tmp1, s2_limb, tmp3 + add s1_ptr, 0x10, s1_ptr + umulxhi tmp1, s2_limb, tmp2 + sub sz, 2, sz + mulx tmp4, s2_limb, tmp1 + add res_ptr, 0x10, res_ptr + umulxhi tmp4, s2_limb, tmp4 + addcc carry, tmp3, tmp3 + stx tmp3, [res_ptr - 0x10] + addxc %g0, tmp2, carry + addcc carry, tmp1, tmp1 + addxc %g0, tmp4, carry + brgz sz, .Lloop + stx tmp1, [res_ptr - 0x08] + + brlz,pt sz, .Lfinish + nop + +.Lfinal_limb: + ldx [s1_ptr + 0x00], tmp1 + mulx tmp1, s2_limb, tmp3 + umulxhi tmp1, s2_limb, tmp2 + addcc carry, tmp3, tmp3 + addxc %g0, tmp2, carry + stx tmp3, [res_ptr + 0x00] + +.Lfinish: + retl + mov carry, %o0 +END(__mpn_mul_1_vis3) diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1.S new file mode 100644 index 0000000000..75fca932b7 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1.S @@ -0,0 +1,56 @@ +/* Multiple versions of mul_1 + + Copyright (C) 2013-2017 Free Software Foundation, Inc. + Contributed by David S. Miller (davem@davemloft.net) + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +ENTRY(__mpn_mul_1) + .type __mpn_mul_1, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_VIS3, %o1 + andcc %o0, %o1, %g0 + be 1f + nop +# ifdef SHARED + sethi %gdop_hix22(__mpn_mul_1_vis3), %o1 + xor %o1, %gdop_lox10(__mpn_mul_1_vis3), %o1 +# else + set __mpn_mul_1_vis3, %o1 +# endif + ba 10f + nop +1: +# ifdef SHARED + sethi %gdop_hix22(__mpn_mul_1_generic), %o1 + xor %o1, %gdop_lox10(__mpn_mul_1_generic), %o1 +# else + set __mpn_mul_1_generic, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(__mpn_mul_1) + +#define __mpn_mul_1 __mpn_mul_1_generic +#include "../mul_1.S" diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memcpy.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memcpy.c new file mode 100644 index 0000000000..2452575343 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memcpy.c @@ -0,0 +1 @@ +#include "../rtld-memcpy.c" diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memset.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memset.c new file mode 100644 index 0000000000..c01eb0beb9 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memset.c @@ -0,0 +1 @@ +#include "../rtld-memset.c" diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-block.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-block.c new file mode 100644 index 0000000000..9d65315a5a --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-block.c @@ -0,0 +1,32 @@ +#include <sparc-ifunc.h> + +#define __sha256_process_block __sha256_process_block_generic +extern void __sha256_process_block_generic (const void *buffer, size_t len, + struct sha256_ctx *ctx); + +#include <crypt/sha256-block.c> + +#undef __sha256_process_block + +extern void __sha256_process_block_crop (const void *buffer, size_t len, + struct sha256_ctx *ctx); + +static bool cpu_supports_sha256(int hwcap) +{ + unsigned long cfr; + + if (!(hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ ("rd %%asr26, %0" : "=r" (cfr)); + if (cfr & (1 << 6)) + return true; + + return false; +} + +extern void __sha256_process_block (const void *buffer, size_t len, + struct sha256_ctx *ctx); +sparc_libc_ifunc (__sha256_process_block, + cpu_supports_sha256(hwcap) ? __sha256_process_block_crop + : __sha256_process_block_generic); diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-crop.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-crop.S new file mode 100644 index 0000000000..8f07e4245a --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-crop.S @@ -0,0 +1,101 @@ +/* SHA256 using sparc crypto opcodes. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#define SHA256 \ + .word 0x81b02840; + + .text + .align 32 +ENTRY(__sha256_process_block_crop) + /* %o0=buffer, %o1=len, %o2=CTX */ + ldx [%o2 + 0x20], %g1 + add %g1, %o1, %g1 + stx %g1, [%o2 + 0x20] + + ld [%o2 + 0x00], %f0 + ld [%o2 + 0x04], %f1 + ld [%o2 + 0x08], %f2 + ld [%o2 + 0x0c], %f3 + ld [%o2 + 0x10], %f4 + ld [%o2 + 0x14], %f5 + andcc %o1, 0x7, %g0 + ld [%o2 + 0x18], %f6 + bne,pn %xcc, 10f + ld [%o2 + 0x1c], %f7 + +1: + ldd [%o0 + 0x00], %f8 + ldd [%o0 + 0x08], %f10 + ldd [%o0 + 0x10], %f12 + ldd [%o0 + 0x18], %f14 + ldd [%o0 + 0x20], %f16 + ldd [%o0 + 0x28], %f18 + ldd [%o0 + 0x30], %f20 + ldd [%o0 + 0x38], %f22 + + SHA256 + + subcc %o1, 0x40, %o1 + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + +5: + st %f0, [%o2 + 0x00] + st %f1, [%o2 + 0x04] + st %f2, [%o2 + 0x08] + st %f3, [%o2 + 0x0c] + st %f4, [%o2 + 0x10] + st %f5, [%o2 + 0x14] + st %f6, [%o2 + 0x18] + retl + st %f7, [%o2 + 0x1c] +10: + alignaddr %o0, %g0, %o0 + + ldd [%o0 + 0x00], %f10 +1: + ldd [%o0 + 0x08], %f12 + ldd [%o0 + 0x10], %f14 + ldd [%o0 + 0x18], %f16 + ldd [%o0 + 0x20], %f18 + ldd [%o0 + 0x28], %f20 + ldd [%o0 + 0x30], %f22 + ldd [%o0 + 0x38], %f24 + ldd [%o0 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA256 + + subcc %o1, 0x40, %o1 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + + ba,a,pt %xcc, 5b +END(__sha256_process_block_crop) diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-block.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-block.c new file mode 100644 index 0000000000..2863e05d09 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-block.c @@ -0,0 +1,32 @@ +#include <sparc-ifunc.h> + +#define __sha512_process_block __sha512_process_block_generic +extern void __sha512_process_block_generic (const void *buffer, size_t len, + struct sha512_ctx *ctx); + +#include <crypt/sha512-block.c> + +#undef __sha512_process_block + +extern void __sha512_process_block_crop (const void *buffer, size_t len, + struct sha512_ctx *ctx); + +static bool cpu_supports_sha512(int hwcap) +{ + unsigned long cfr; + + if (!(hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ ("rd %%asr26, %0" : "=r" (cfr)); + if (cfr & (1 << 6)) + return true; + + return false; +} + +extern void __sha512_process_block (const void *buffer, size_t len, + struct sha512_ctx *ctx); +sparc_libc_ifunc (__sha512_process_block, + cpu_supports_sha512(hwcap) ? __sha512_process_block_crop + : __sha512_process_block_generic); diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-crop.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-crop.S new file mode 100644 index 0000000000..f78354c485 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-crop.S @@ -0,0 +1,131 @@ +/* SHA512 using sparc crypto opcodes. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define SHA512 \ + .word 0x81b02860; + + .text + .align 32 +ENTRY(__sha512_process_block_crop) + /* %o0=buffer, %o1=len, %o2=CTX */ + ldx [%o2 + 0x48], %g1 + add %g1, %o1, %o4 + stx %o4, [%o2 + 0x48] + cmp %o4, %g1 + bgeu,pt %xcc, 1f + nop + ldx [%o2 + 0x40], %g1 + add %g1, 1, %g1 + stx %g1, [%o2 + 0x40] + +1: ldd [%o2 + 0x00], %f0 + ldd [%o2 + 0x08], %f2 + ldd [%o2 + 0x10], %f4 + ldd [%o2 + 0x18], %f6 + ldd [%o2 + 0x20], %f8 + ldd [%o2 + 0x28], %f10 + andcc %o1, 0x7, %g0 + ldd [%o2 + 0x30], %f12 + bne,pn %xcc, 10f + ldd [%o2 + 0x38], %f14 + +1: + ldd [%o0 + 0x00], %f16 + ldd [%o0 + 0x08], %f18 + ldd [%o0 + 0x10], %f20 + ldd [%o0 + 0x18], %f22 + ldd [%o0 + 0x20], %f24 + ldd [%o0 + 0x28], %f26 + ldd [%o0 + 0x30], %f28 + ldd [%o0 + 0x38], %f30 + ldd [%o0 + 0x40], %f32 + ldd [%o0 + 0x48], %f34 + ldd [%o0 + 0x50], %f36 + ldd [%o0 + 0x58], %f38 + ldd [%o0 + 0x60], %f40 + ldd [%o0 + 0x68], %f42 + ldd [%o0 + 0x70], %f44 + ldd [%o0 + 0x78], %f46 + + SHA512 + + subcc %o1, 0x80, %o1 + bne,pt %xcc, 1b + add %o0, 0x80, %o0 + +5: + std %f0, [%o2 + 0x00] + std %f2, [%o2 + 0x08] + std %f4, [%o2 + 0x10] + std %f6, [%o2 + 0x18] + std %f8, [%o2 + 0x20] + std %f10, [%o2 + 0x28] + std %f12, [%o2 + 0x30] + retl + std %f14, [%o2 + 0x38] +10: + alignaddr %o0, %g0, %o0 + + ldd [%o0 + 0x00], %f18 +1: + ldd [%o0 + 0x08], %f20 + ldd [%o0 + 0x10], %f22 + ldd [%o0 + 0x18], %f24 + ldd [%o0 + 0x20], %f26 + ldd [%o0 + 0x28], %f28 + ldd [%o0 + 0x30], %f30 + ldd [%o0 + 0x38], %f32 + ldd [%o0 + 0x40], %f34 + ldd [%o0 + 0x48], %f36 + ldd [%o0 + 0x50], %f38 + ldd [%o0 + 0x58], %f40 + ldd [%o0 + 0x60], %f42 + ldd [%o0 + 0x68], %f44 + ldd [%o0 + 0x70], %f46 + ldd [%o0 + 0x78], %f48 + ldd [%o0 + 0x80], %f50 + + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + faligndata %f26, %f28, %f24 + faligndata %f28, %f30, %f26 + faligndata %f30, %f32, %f28 + faligndata %f32, %f34, %f30 + faligndata %f34, %f36, %f32 + faligndata %f36, %f38, %f34 + faligndata %f38, %f40, %f36 + faligndata %f40, %f42, %f38 + faligndata %f42, %f44, %f40 + faligndata %f44, %f46, %f42 + faligndata %f46, %f48, %f44 + faligndata %f48, %f50, %f46 + + SHA512 + + subcc %o1, 0x80, %o1 + fsrc2 %f50, %f18 + bne,pt %xcc, 1b + add %o0, 0x80, %o0 + + ba,a,pt %xcc, 5b +END(__sha512_process_block_crop) diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n-vis3.S new file mode 100644 index 0000000000..2d2a75dff8 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n-vis3.S @@ -0,0 +1,71 @@ +! SPARC v9 64-bit VIS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 +! and store difference in a third limb vector. +! +! Copyright (C) 2013-2017 Free Software Foundation, Inc. +! This file is part of the GNU C Library. +! Contributed by David S. Miller <davem@davemloft.net> +! +! The GNU C Library is free software; you can redistribute it and/or +! modify it under the terms of the GNU Lesser General Public +! License as published by the Free Software Foundation; either +! 
version 2.1 of the License, or (at your option) any later version. +! +! The GNU C Library is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public +! License along with the GNU C Library; if not, see +! <http://www.gnu.org/licenses/>. + +#include <sysdep.h> + +#define res_ptr %o0 +#define s1_ptr %o1 +#define s2_ptr %o2 +#define sz %o3 +#define tmp1 %g1 +#define tmp2 %g2 +#define tmp3 %g3 +#define tmp4 %o4 + + .register %g2,#scratch + .register %g3,#scratch +ENTRY(__mpn_sub_n_vis3) + subcc sz, 1, sz + be .Lfinal_limb + cmp %g0, 1 + +.Lloop: + ldx [s2_ptr + 0x00], tmp1 + add s2_ptr, 0x10, s2_ptr + ldx [s1_ptr + 0x00], tmp2 + add s1_ptr, 0x10, s1_ptr + ldx [s2_ptr - 0x08], tmp3 + add res_ptr, 0x10, res_ptr + ldx [s1_ptr - 0x08], tmp4 + sub sz, 2, sz + xnor tmp1, %g0, tmp1 + addxccc tmp1, tmp2, tmp1 + stx tmp1, [res_ptr - 0x10] + xnor tmp3, %g0, tmp3 + addxccc tmp3, tmp4, tmp3 + brgz sz, .Lloop + stx tmp3, [res_ptr - 0x08] + + brlz,pt sz, .Lfinish + nop + +.Lfinal_limb: + ldx [s2_ptr + 0x00], tmp1 + ldx [s1_ptr + 0x00], tmp2 + xnor tmp1, %g0, tmp1 + addxccc tmp1, tmp2, tmp1 + stx tmp1, [res_ptr + 0x00] + +.Lfinish: + clr %o0 + retl + movcc %xcc, 1, %o0 +END(__mpn_sub_n_vis3) diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n.S new file mode 100644 index 0000000000..d20a286df1 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n.S @@ -0,0 +1,56 @@ +/* Multiple versions of sub_n + + Copyright (C) 2013-2017 Free Software Foundation, Inc. + Contributed by David S. Miller (davem@davemloft.net) + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY(__mpn_sub_n) + .type __mpn_sub_n, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_VIS3, %o1 + andcc %o0, %o1, %g0 + be 1f + nop +# ifdef SHARED + sethi %gdop_hix22(__mpn_sub_n_vis3), %o1 + xor %o1, %gdop_lox10(__mpn_sub_n_vis3), %o1 +# else + set __mpn_sub_n_vis3, %o1 +# endif + ba 10f + nop +1: +# ifdef SHARED + sethi %gdop_hix22(__mpn_sub_n_generic), %o1 + xor %o1, %gdop_lox10(__mpn_sub_n_generic), %o1 +# else + set __mpn_sub_n_generic, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(__mpn_sub_n) + +#define __mpn_sub_n __mpn_sub_n_generic +#include "../sub_n.S" diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1-vis3.S new file mode 100644 index 0000000000..99644491e7 --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1-vis3.S @@ -0,0 +1,87 @@ +! 
SPARC v9 64-bit VIS3 __mpn_submul_1 -- Multiply a limb vector with a +! limb and subtract the result from a second limb vector. +! +! Copyright (C) 2013-2017 Free Software Foundation, Inc. +! This file is part of the GNU C Library. +! Contributed by David S. Miller <davem@davemloft.net> +! +! The GNU C Library is free software; you can redistribute it and/or +! modify it under the terms of the GNU Lesser General Public +! License as published by the Free Software Foundation; either +! version 2.1 of the License, or (at your option) any later version. +! +! The GNU C Library is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public +! License along with the GNU C Library; if not, see +! <http://www.gnu.org/licenses/>. + +#include <sysdep.h> + +#define res_ptr %i0 +#define s1_ptr %i1 +#define sz %i2 +#define s2_limb %i3 +#define carry %o5 +#define tmp1 %g1 +#define tmp2 %g2 +#define tmp3 %g3 +#define tmp4 %o4 +#define tmp5 %l0 +#define tmp6 %l1 +#define tmp7 %l2 +#define tmp8 %l3 + + .register %g2,#scratch + .register %g3,#scratch +ENTRY(__mpn_submul_1_vis3) + save %sp, -176, %sp + subcc sz, 1, sz + be .Lfinal_limb + clr carry + +.Lloop: + ldx [s1_ptr + 0x00], tmp1 + ldx [res_ptr + 0x00], tmp3 + ldx [s1_ptr + 0x08], tmp2 + ldx [res_ptr + 0x08], tmp4 + mulx tmp1, s2_limb, tmp5 + add s1_ptr, 0x10, s1_ptr + umulxhi tmp1, s2_limb, tmp6 + add res_ptr, 0x10, res_ptr + mulx tmp2, s2_limb, tmp7 + sub sz, 2, sz + umulxhi tmp2, s2_limb, tmp8 + addcc carry, tmp5, tmp5 + addxc %g0, tmp6, carry + subcc tmp3, tmp5, tmp5 + addxc %g0, carry, carry + stx tmp5, [res_ptr - 0x10] + addcc carry, tmp7, tmp7 + addxc %g0, tmp8, carry + subcc tmp4, tmp7, tmp7 + addxc %g0, carry, carry + brgz sz, .Lloop + stx tmp7, [res_ptr - 0x08] + + brlz,pt sz, .Lfinish + nop + +.Lfinal_limb: + ldx [s1_ptr + 0x00], tmp1 + ldx [res_ptr + 0x00], tmp3 + mulx tmp1, s2_limb, tmp5 + umulxhi tmp1, s2_limb, tmp6 + addcc carry, tmp5, tmp5 + addxc %g0, tmp6, carry + subcc tmp3, tmp5, tmp5 + addxc %g0, carry, carry + stx tmp5, [res_ptr + 0x00] + +.Lfinish: + jmpl %i7 + 8, %g0 + restore carry, 0, %o0 +END(__mpn_submul_1_vis3) diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1.S new file mode 100644 index 0000000000..3c297d989b --- /dev/null +++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1.S @@ -0,0 +1,56 @@ +/* Multiple versions of submul_1 + + Copyright (C) 2013-2017 Free Software Foundation, Inc. + Contributed by David S. Miller (davem@davemloft.net) + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +ENTRY(__mpn_submul_1) + .type __mpn_submul_1, @gnu_indirect_function +# ifdef SHARED + SETUP_PIC_REG_LEAF(o3, o5) +# endif + set HWCAP_SPARC_VIS3, %o1 + andcc %o0, %o1, %g0 + be 1f + nop +# ifdef SHARED + sethi %gdop_hix22(__mpn_submul_1_vis3), %o1 + xor %o1, %gdop_lox10(__mpn_submul_1_vis3), %o1 +# else + set __mpn_submul_1_vis3, %o1 +# endif + ba 10f + nop +1: +# ifdef SHARED + sethi %gdop_hix22(__mpn_submul_1_generic), %o1 + xor %o1, %gdop_lox10(__mpn_submul_1_generic), %o1 +# else + set __mpn_submul_1_generic, %o1 +# endif +10: +# ifdef SHARED + add %o3, %o1, %o1 +# endif + retl + mov %o1, %o0 +END(__mpn_submul_1) + +#define __mpn_submul_1 __mpn_submul_1_generic +#include "../submul_1.S" |
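
Editorial note: every ifunc wrapper added by this patch follows the same hand-written resolver shape seen in memcpy.S, memset.S, mul_1.S, sub_n.S and submul_1.S above: the kernel's AT_HWCAP word arrives in %o0, the resolver tests capability bits from most to least specific, and it returns the address of the chosen implementation (the %gdop_hix22/%gdop_lox10 sequences plus the final add against the PIC register handle SHARED builds). The C sketch below mirrors only the selection order of the memcpy resolver, as an illustration; resolve_memcpy is not a symbol introduced by this patch, it will not link outside libc because it references internal __memcpy_* entry points, and the real resolver remains the assembly ENTRY(memcpy) block. The HWCAP_SPARC_* bits are the ones tested in the assembly; on SPARC targets glibc exposes them through <sys/auxv.h>.

/* Illustrative sketch only: C rendering of the hwcap checks done by the
   assembly resolver in memcpy.S.  resolve_memcpy is hypothetical; the
   __memcpy_* externs are the implementations built by this patch.  */
#include <stddef.h>
#include <sys/auxv.h>   /* getauxval, AT_HWCAP; HWCAP_SPARC_* on SPARC.  */

typedef void *(*memcpy_fn) (void *, const void *, size_t);

extern void *__memcpy_niagara4 (void *, const void *, size_t);
extern void *__memcpy_niagara2 (void *, const void *, size_t);
extern void *__memcpy_niagara1 (void *, const void *, size_t);
extern void *__memcpy_ultra3 (void *, const void *, size_t);
extern void *__memcpy_ultra1 (void *, const void *, size_t);

static memcpy_fn
resolve_memcpy (unsigned long hwcap)
{
  /* Same order as the assembly: most capable CPU first.  */
  if (hwcap & HWCAP_SPARC_CRYPTO)      /* Niagara-4 class or newer.  */
    return __memcpy_niagara4;
  if (hwcap & HWCAP_SPARC_N2)          /* Niagara-2.  */
    return __memcpy_niagara2;
  if (hwcap & HWCAP_SPARC_BLKINIT)     /* Niagara-1.  */
    return __memcpy_niagara1;
  if (hwcap & HWCAP_SPARC_ULTRA3)      /* UltraSPARC-III.  */
    return __memcpy_ultra3;
  return __memcpy_ultra1;              /* Baseline, from ../memcpy.S.  */
}

The memset/__bzero resolver uses the same pattern with a shorter chain (CRYPTO selects the niagara4 version, BLKINIT the niagara1 version, otherwise the ultra1 build of ../memset.S), and the gmp wrappers (__mpn_mul_1, __mpn_sub_n, __mpn_submul_1) test only HWCAP_SPARC_VIS3 before falling back to the _generic builds of the parent directory's sources.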