author    Zack Weinberg <zackw@panix.com>  2017-06-08 15:39:03 -0400
committer Zack Weinberg <zackw@panix.com>  2017-06-08 15:39:03 -0400
commit    5046dbb4a7eba5eccfd258f92f4735c9ffc8d069
tree      4470480d904b65cf14ca524f96f79eca818c3eaf
parent    199fc19d3aaaf57944ef036e15904febe877fc93
Prepare for radical source tree reorganization.
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/i386')
-rw-r--r-- REORG.TODO/sysdeps/i386/Implies | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/Makefile | 103
-rw-r--r-- REORG.TODO/sysdeps/i386/Versions | 35
-rw-r--r-- REORG.TODO/sysdeps/i386/____longjmp_chk.S | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/__longjmp.S | 72
-rw-r--r-- REORG.TODO/sysdeps/i386/abort-instr.h | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/add_n.S | 111
-rw-r--r-- REORG.TODO/sysdeps/i386/addmul_1.S | 86
-rw-r--r-- REORG.TODO/sysdeps/i386/asm-syntax.h | 24
-rw-r--r-- REORG.TODO/sysdeps/i386/atomic-machine.h | 545
-rw-r--r-- REORG.TODO/sysdeps/i386/backtrace.c | 163
-rw-r--r-- REORG.TODO/sysdeps/i386/bcopy.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/bsd-_setjmp.S | 56
-rw-r--r-- REORG.TODO/sysdeps/i386/bsd-setjmp.S | 66
-rw-r--r-- REORG.TODO/sysdeps/i386/bzero.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/cacheinfo.c | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/configure | 84
-rw-r--r-- REORG.TODO/sysdeps/i386/configure.ac | 52
-rw-r--r-- REORG.TODO/sysdeps/i386/crti.S | 84
-rw-r--r-- REORG.TODO/sysdeps/i386/crtn.S | 47
-rw-r--r-- REORG.TODO/sysdeps/i386/dl-irel.h | 51
-rw-r--r-- REORG.TODO/sysdeps/i386/dl-lookupcfg.h | 32
-rw-r--r-- REORG.TODO/sysdeps/i386/dl-machine.h | 757
-rw-r--r-- REORG.TODO/sysdeps/i386/dl-procinfo.c | 65
-rw-r--r-- REORG.TODO/sysdeps/i386/dl-tls.h | 61
-rw-r--r-- REORG.TODO/sysdeps/i386/dl-tlsdesc.S | 285
-rw-r--r-- REORG.TODO/sysdeps/i386/dl-tlsdesc.h | 61
-rw-r--r-- REORG.TODO/sysdeps/i386/dl-trampoline.S | 215
-rw-r--r-- REORG.TODO/sysdeps/i386/ffs.c | 50
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/Implies | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/Versions | 6
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/doasin.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_acos.S | 25
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_acosf.S | 24
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_acosh.S | 101
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_acoshf.S | 101
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_acoshl.S | 107
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_acosl.c | 29
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_asin.S | 38
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_asinf.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_atan2.S | 30
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_atan2f.S | 30
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_atan2l.c | 19
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_atanh.S | 112
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_atanhf.S | 109
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_atanhl.S | 127
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_exp.S | 73
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_exp10.S | 53
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_exp10f.S | 53
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_exp10l.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_exp2.S | 52
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_exp2f.S | 52
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_exp2l.S | 60
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_expf.S | 74
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_expl.S | 226
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_fmod.S | 18
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_fmodf.S | 19
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_fmodl.c | 23
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_hypot.S | 75
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_hypotf.S | 64
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_ilogb.S | 42
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S | 42
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S | 43
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_log.S | 92
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_log10.S | 68
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_log10f.S | 69
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_log10l.S | 71
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_log2.S | 69
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_log2f.S | 69
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_log2l.S | 70
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_logf.S | 93
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_logl.S | 97
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_pow.S | 456
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_powf.S | 392
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_powl.S | 459
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_remainder.S | 18
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_remainderf.S | 18
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_remainderl.S | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_scalb.S | 100
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_scalbf.S | 102
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_scalbl.S | 90
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_sqrt.S | 23
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S | 13
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c | 69
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c | 54
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c | 54
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fegetenv.c | 49
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fegetexcept.c | 31
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fegetmode.c | 32
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fegetround.c | 33
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c | 50
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fenv_private.h | 501
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fesetenv.c | 131
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fesetexcept.c | 31
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fesetmode.c | 54
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fesetround.c | 54
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/feupdateenv.c | 60
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c | 57
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c | 124
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c | 69
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/ftestexcept.c | 40
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/halfulp.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h | 340
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/libm-test-ulps | 2202
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/math-tests.h | 27
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/math_private.h | 7
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/mpatan.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/mpatan2.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/mpexp.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/mplog.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/mpsqrt.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_asinh.S | 139
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_asinhf.S | 139
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_asinhl.S | 144
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_atan.S | 30
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_atanf.S | 30
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_atanl.c | 22
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_cbrt.S | 200
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S | 177
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S | 229
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_ceil.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_ceilf.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_ceill.S | 40
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_copysign.S | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_copysignf.S | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_copysignl.S | 21
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_expm1.S | 113
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_expm1f.S | 113
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_expm1l.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fabs.S | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fabsf.S | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fabsl.S | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fdim.c | 50
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_finite.S | 17
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_finitef.S | 16
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_finitel.S | 15
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_floor.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_floorf.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_floorl.S | 40
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fmax.S | 43
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S | 43
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S | 71
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fmin.S | 43
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fminf.S | 43
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fminl.S | 71
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c | 42
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_frexp.S | 83
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_frexpf.S | 80
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_frexpl.S | 92
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_isinfl.c | 32
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_isnanl.c | 43
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_llrint.S | 36
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_llrintf.S | 36
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_llrintl.S | 36
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_log1p.S | 67
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_log1pf.S | 67
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_log1pl.S | 76
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_logb.S | 16
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_logbf.S | 16
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_logbl.c | 19
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_lrint.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_lrintf.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_lrintl.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S | 23
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c | 125
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c | 93
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c | 77
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_remquo.S | 45
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_remquof.S | 45
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_remquol.S | 45
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_rint.S | 15
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_rintf.S | 15
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_rintl.c | 18
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_scalbln.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_scalbn.S | 24
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S | 24
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S | 23
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_significand.S | 16
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_significandf.S | 16
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_significandl.c | 19
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_trunc.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_truncf.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/s_truncl.S | 40
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/slowexp.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/slowpow.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/t_exp.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/gccframe.h | 27
-rw-r--r-- REORG.TODO/sysdeps/i386/gmp-mparam.h | 28
-rw-r--r-- REORG.TODO/sysdeps/i386/htonl.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/htons.S | 35
-rw-r--r-- REORG.TODO/sysdeps/i386/i386-mcount.S | 79
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/add_n.S | 143
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/addmul_1.S | 94
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/bzero.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/init-arch.h | 19
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/lshift.S | 255
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/memcopy.h | 95
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/memcpy.S | 124
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/mempcpy.S | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/memset.S | 121
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/memusage.h | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/mul_1.S | 90
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/rshift.S | 255
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/stpcpy.S | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/strchr.S | 348
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/strcpy.S | 169
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/strlen.S | 182
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/sub_n.S | 143
-rw-r--r-- REORG.TODO/sysdeps/i386/i586/submul_1.S | 94
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/Makefile | 12
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/add_n.S | 110
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/bcopy.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/bzero.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/dl-hash.h | 79
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/ffs.c | 48
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/e_log.S | 29
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S | 30
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S | 94
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S | 22
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S | 325
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps | 2188
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S | 553
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c | 29
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S | 586
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c | 30
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S | 566
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c | 28
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S | 58
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S | 58
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/hp-timing.h | 42
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/init-arch.h | 19
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/memcmp.S | 408
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/memcpy.S | 98
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/memmove.S | 120
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/mempcpy.S | 65
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/memset.S | 100
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/memusage.h | 21
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/Makefile | 44
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S | 59
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S | 62
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c | 376
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym | 11
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S | 502
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S | 709
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S | 65
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S | 1225
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 2157
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S | 62
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S | 681
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S | 1809
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 3162
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S | 78
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S | 50
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S | 89
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S | 94
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S | 81
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S | 50
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c | 7
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S | 417
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S | 724
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S | 45
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S | 811
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S | 860
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memset.S | 75
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S | 82
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S | 65
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c | 27
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c | 27
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c | 12
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c | 13
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S | 7
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S | 1245
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S | 572
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S | 92
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S | 158
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S | 348
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S | 57
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S | 804
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 2810
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S | 95
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S | 2250
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S | 3901
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S | 116
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S | 75
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S | 125
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S | 695
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S | 60
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c | 13
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S | 7
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c | 10
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S | 282
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S | 708
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S | 57
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S | 56
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c | 22
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S | 219
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S | 36
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c | 14
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S | 1018
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S | 600
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S | 36
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S | 193
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S | 354
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S | 35
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S | 40
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/nptl/tls.h | 35
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/stack-aliasing.h | 23
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/strcmp.S | 52
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/tst-stack-align.h | 44
-rw-r--r-- REORG.TODO/sysdeps/i386/i786/Implies | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/init-arch.h | 19
-rw-r--r-- REORG.TODO/sysdeps/i386/jmpbuf-offsets.h | 25
-rw-r--r-- REORG.TODO/sysdeps/i386/jmpbuf-unwind.h | 47
-rw-r--r-- REORG.TODO/sysdeps/i386/ldbl2mpn.c | 120
-rw-r--r-- REORG.TODO/sysdeps/i386/ldsodefs.h | 41
-rw-r--r-- REORG.TODO/sysdeps/i386/link-defines.sym | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/lshift.S | 103
-rw-r--r-- REORG.TODO/sysdeps/i386/machine-gmon.h | 40
-rw-r--r-- REORG.TODO/sysdeps/i386/memchr.S | 322
-rw-r--r-- REORG.TODO/sysdeps/i386/memcmp.S | 73
-rw-r--r-- REORG.TODO/sysdeps/i386/memcopy.h | 92
-rw-r--r-- REORG.TODO/sysdeps/i386/memcpy.S | 95
-rw-r--r-- REORG.TODO/sysdeps/i386/memcpy_chk.S | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/memmove.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/memmove_chk.S | 33
-rw-r--r-- REORG.TODO/sysdeps/i386/mempcpy.S | 7
-rw-r--r-- REORG.TODO/sysdeps/i386/mempcpy_chk.S | 33
-rw-r--r-- REORG.TODO/sysdeps/i386/memset.S | 68
-rw-r--r-- REORG.TODO/sysdeps/i386/memset_chk.S | 33
-rw-r--r-- REORG.TODO/sysdeps/i386/memusage.h | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/mp_clz_tab.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/mul_1.S | 86
-rw-r--r-- REORG.TODO/sysdeps/i386/nptl/Makefile | 26
-rw-r--r-- REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c | 19
-rw-r--r-- REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S | 31
-rw-r--r-- REORG.TODO/sysdeps/i386/nptl/pthreaddef.h | 40
-rw-r--r-- REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym | 17
-rw-r--r-- REORG.TODO/sysdeps/i386/nptl/tls.h | 435
-rw-r--r-- REORG.TODO/sysdeps/i386/preconfigure | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/pthread_spin_trylock.S | 46
-rw-r--r-- REORG.TODO/sysdeps/i386/rawmemchr.S | 222
-rw-r--r-- REORG.TODO/sysdeps/i386/rshift.S | 105
-rw-r--r-- REORG.TODO/sysdeps/i386/setfpucw.c | 54
-rw-r--r-- REORG.TODO/sysdeps/i386/setjmp.S | 58
-rw-r--r-- REORG.TODO/sysdeps/i386/stackguard-macros.h | 12
-rw-r--r-- REORG.TODO/sysdeps/i386/stackinfo.h | 43
-rw-r--r-- REORG.TODO/sysdeps/i386/start.S | 139
-rw-r--r-- REORG.TODO/sysdeps/i386/stpcpy.S | 88
-rw-r--r-- REORG.TODO/sysdeps/i386/stpncpy.S | 147
-rw-r--r-- REORG.TODO/sysdeps/i386/strcat.S | 265
-rw-r--r-- REORG.TODO/sysdeps/i386/strchr.S | 290
-rw-r--r-- REORG.TODO/sysdeps/i386/strchrnul.S | 278
-rw-r--r-- REORG.TODO/sysdeps/i386/strcspn.S | 240
-rw-r--r-- REORG.TODO/sysdeps/i386/string-inlines.c | 47
-rw-r--r-- REORG.TODO/sysdeps/i386/strlen.S | 132
-rw-r--r-- REORG.TODO/sysdeps/i386/strlen.c | 35
-rw-r--r-- REORG.TODO/sysdeps/i386/strpbrk.S | 243
-rw-r--r-- REORG.TODO/sysdeps/i386/strrchr.S | 334
-rw-r--r-- REORG.TODO/sysdeps/i386/strspn.S | 240
-rw-r--r-- REORG.TODO/sysdeps/i386/sub_n.S | 111
-rw-r--r-- REORG.TODO/sysdeps/i386/submul_1.S | 86
-rw-r--r-- REORG.TODO/sysdeps/i386/symbol-hacks.h | 21
-rw-r--r-- REORG.TODO/sysdeps/i386/sys/ucontext.h | 139
-rw-r--r-- REORG.TODO/sysdeps/i386/sysdep.h | 159
-rw-r--r-- REORG.TODO/sysdeps/i386/tls-macros.h | 78
-rw-r--r-- REORG.TODO/sysdeps/i386/tlsdesc.c | 268
-rw-r--r-- REORG.TODO/sysdeps/i386/tlsdesc.sym | 17
-rw-r--r-- REORG.TODO/sysdeps/i386/tst-audit.h | 25
-rw-r--r-- REORG.TODO/sysdeps/i386/tst-audit3.c | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/tst-audit3.h | 20
-rw-r--r-- REORG.TODO/sysdeps/i386/tst-auditmod3a.c | 38
-rw-r--r-- REORG.TODO/sysdeps/i386/tst-auditmod3b.c | 186
-rwxr-xr-x REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh | 103
-rw-r--r-- REORG.TODO/sysdeps/i386/tst-stack-align.h | 41
450 files changed, 62011 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/Implies b/REORG.TODO/sysdeps/i386/Implies
new file mode 100644
index 0000000000..20b2dffc29
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Implies
@@ -0,0 +1,5 @@
+x86
+wordsize-32
+ieee754/ldbl-96
+ieee754/dbl-64
+ieee754/flt-32
diff --git a/REORG.TODO/sysdeps/i386/Makefile b/REORG.TODO/sysdeps/i386/Makefile
new file mode 100644
index 0000000000..e30e1339f0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Makefile
@@ -0,0 +1,103 @@
+# The mpn functions need a #define for asm syntax flavor.
+# Every i386 port in use uses gas syntax (I think).
+asm-CPPFLAGS += -DGAS_SYNTAX
+
+# The i386 `long double' is a distinct type we support.
+long-double-fcts = yes
+
+ifeq ($(subdir),string)
+sysdep_routines += cacheinfo
+endif
+
+ifeq ($(subdir),gmon)
+sysdep_routines += i386-mcount
+endif
+
+ifeq ($(subdir),elf)
+CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused
+CFLAGS-dl-load.c += -Wno-unused
+CFLAGS-dl-reloc.c += -Wno-unused
+endif
+
+ifeq ($(subdir),debug)
+CFLAGS-backtrace.c += -fexceptions
+endif
+
+# Most of the glibc routines never call user-defined callbacks nor use
+# any FPU or SSE* instructions, and as such don't need %esp alignment
+# bigger than 4 bytes.
+# Lots of routines in math use the FPU, so make the math subdir an
+# exception here.
+# In gcc 4.6 (and maybe earlier?) giving -mpreferred-stack-boundary=2 is
+# an error, so don't try to reduce it here like we used to.  We still
+# explicitly set -mpreferred-stack-boundary=4 in the places where it
+# matters, in case an older compiler defaulted to 2.
+ifeq ($(subdir),math)
+sysdep-CFLAGS += -mpreferred-stack-boundary=4
+else
+ifeq ($(subdir),csu)
+sysdep-CFLAGS += -mpreferred-stack-boundary=4
+gen-as-const-headers += link-defines.sym
+else
+# Likewise, any function which calls user callbacks
+uses-callbacks += -mpreferred-stack-boundary=4
+# Likewise, any stack alignment tests
+stack-align-test-flags += -malign-double -mpreferred-stack-boundary=4
+endif
+endif
+
+# And a couple of other routines
+ifeq ($(subdir),stdlib)
+CFLAGS-exit.c += -mpreferred-stack-boundary=4
+CFLAGS-cxa_finalize.c += -mpreferred-stack-boundary=4
+endif
+ifeq ($(subdir),elf)
+CFLAGS-dl-init.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-fini.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-open.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-close.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-error.c += -mpreferred-stack-boundary=4
+endif
+ifeq ($(subdir),dlfcn)
+CFLAGS-dlopen.c += -mpreferred-stack-boundary=4
+CFLAGS-dlopenold.c += -mpreferred-stack-boundary=4
+CFLAGS-dlclose.c += -mpreferred-stack-boundary=4
+CFLAGS-dlerror.c += -mpreferred-stack-boundary=4
+endif
+
+ifneq (,$(filter -mno-tls-direct-seg-refs,$(CFLAGS)))
+defines += -DNO_TLS_DIRECT_SEG_REFS
+endif
+
+ifeq ($(subdir),elf)
+sysdep-dl-routines += tlsdesc dl-tlsdesc
+
+tests += tst-audit3
+modules-names += tst-auditmod3a tst-auditmod3b
+
+$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
+$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
+tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so
+endif
+
+ifeq ($(subdir),csu)
+gen-as-const-headers += tlsdesc.sym
+endif
+
+# Make sure no code in ld.so uses mm/xmm/ymm/zmm registers on i386 since
+# the first 3 mm/xmm/ymm/zmm registers are used to pass vector parameters
+# which must be preserved.
+# With SSE disabled, ensure -fpmath is not set to use sse either.
+rtld-CFLAGS += -mno-sse -mno-mmx -mfpmath=387
+ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   $(rtld-CFLAGS))
+
+tests-special += $(objpfx)tst-ld-sse-use.out
+$(objpfx)tst-ld-sse-use.out: ../sysdeps/i386/tst-ld-sse-use.sh $(objpfx)ld.so
+	@echo "Checking ld.so for SSE register use.  This will take a few seconds..."
+	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
+	$(evaluate-test)
+else
+CFLAGS-.os += $(if $(filter rtld-%.os,$(@F)), $(rtld-CFLAGS))
+endif
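
To make the alignment comments in the Makefile above concrete, here is a hypothetical user callback (illustrative C, not part of the patch) compiled with SSE enabled.  The compiler is entitled to spill the __m128 local with a 16-byte-aligned store, so any glibc path that can reach code like this must keep %esp 16-byte aligned, which is what -mpreferred-stack-boundary=4 guarantees:

    #include <xmmintrin.h>

    /* Hypothetical callback.  GCC may spill `acc' with movaps, which
       faults unless %esp was 16-byte aligned at the call.  */
    float
    sum4 (const float *v, int n)
    {
      __m128 acc = _mm_setzero_ps ();
      for (int i = 0; i + 4 <= n; i += 4)
        acc = _mm_add_ps (acc, _mm_loadu_ps (v + i));
      float out[4];
      _mm_storeu_ps (out, acc);
      return out[0] + out[1] + out[2] + out[3];
    }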
diff --git a/REORG.TODO/sysdeps/i386/Versions b/REORG.TODO/sysdeps/i386/Versions
new file mode 100644
index 0000000000..7be44aad7a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Versions
@@ -0,0 +1,35 @@
+ld {
+  GLIBC_2.3 {
+    # The alternative i386 runtime interface to TLS.
+    ___tls_get_addr;
+  }
+}
+libc {
+  GLIBC_2.0 {
+    # Functions from libgcc.
+    __divdi3; __moddi3; __udivdi3; __umoddi3;
+  }
+  GLIBC_2.1 {
+    # global variable
+    _fp_hw;
+  }
+  GLIBC_2.1.1 {
+    # extern inline functions used by <bits/string.h>
+    __memcpy_c; __memset_cc; __memset_cg; __memset_gg;
+    __memcpy_by2; __memcpy_by4; __memcpy_g; __mempcpy_by2; __mempcpy_by4;
+    __mempcpy_byn; __memset_ccn_by2; __memset_ccn_by4; __memset_gcn_by2;
+    __memset_gcn_by4; __stpcpy_g; __strcat_c; __strcat_g; __strchr_c;
+    __strchr_g; __strchrnul_c; __strchrnul_g; __strcmp_gg; __strcpy_g;
+    __strcspn_c1; __strcspn_cg; __strcspn_g; __strlen_g; __strncat_g;
+    __strncmp_g; __strncpy_by2; __strncpy_by4; __strncpy_byn; __strncpy_gg;
+    __strpbrk_cg; __strpbrk_g; __strrchr_c; __strrchr_g; __strspn_c1;
+    __strspn_cg; __strspn_g; __strstr_cg; __strstr_g;
+  }
+}
+libm {
+  GLIBC_2.1 {
+    # A generic bug got this omitted from other configurations' version
+    # sets, but we always had it.
+    exp2l;
+  }
+}
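
A note on the GLIBC_2.0 set above: i386 has no 64-bit divide instruction, so GCC lowers `long long' division and modulo to calls to these libgcc helpers, and old binaries resolved them from libc, which is why libc keeps exporting them.  A minimal sketch of code that generates such a call (the function name is illustrative):

    /* Compiled at -O2 for i386, the division below becomes
       `call __divdi3' rather than an inline divide.  */
    long long
    div64 (long long a, long long b)
    {
      return a / b;
    }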
diff --git a/REORG.TODO/sysdeps/i386/____longjmp_chk.S b/REORG.TODO/sysdeps/i386/____longjmp_chk.S
new file mode 100644
index 0000000000..0910861a9d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/____longjmp_chk.S
@@ -0,0 +1 @@
+#error "OS-specific version needed"
diff --git a/REORG.TODO/sysdeps/i386/__longjmp.S b/REORG.TODO/sysdeps/i386/__longjmp.S
new file mode 100644
index 0000000000..3719763cd6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/__longjmp.S
@@ -0,0 +1,72 @@
+/* longjmp for i386.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <asm-syntax.h>
+#include <stap-probe.h>
+
+	.text
+ENTRY (__longjmp)
+#ifdef PTR_DEMANGLE
+	movl 4(%esp), %eax	/* User's jmp_buf in %eax.  */
+
+	/* Save the return address now.  */
+	movl (JB_PC*4)(%eax), %edx
+	/* Get the stack pointer.  */
+	movl (JB_SP*4)(%eax), %ecx
+	PTR_DEMANGLE (%edx)
+	PTR_DEMANGLE (%ecx)
+	LIBC_PROBE (longjmp, 3, 4@%eax, -4@8(%esp), 4@%edx)
+	cfi_def_cfa(%eax, 0)
+	cfi_register(%eip, %edx)
+	cfi_register(%esp, %ecx)
+	cfi_offset(%ebx, JB_BX*4)
+	cfi_offset(%esi, JB_SI*4)
+	cfi_offset(%edi, JB_DI*4)
+	cfi_offset(%ebp, JB_BP*4)
+	/* Restore registers.  */
+	movl (JB_BX*4)(%eax), %ebx
+	movl (JB_SI*4)(%eax), %esi
+	movl (JB_DI*4)(%eax), %edi
+	movl (JB_BP*4)(%eax), %ebp
+	cfi_restore(%ebx)
+	cfi_restore(%esi)
+	cfi_restore(%edi)
+	cfi_restore(%ebp)
+
+	LIBC_PROBE (longjmp_target, 3, 4@%eax, -4@8(%esp), 4@%edx)
+	movl 8(%esp), %eax	/* Second argument is return value.  */
+	movl %ecx, %esp
+#else
+	movl 4(%esp), %ecx	/* User's jmp_buf in %ecx.  */
+	movl 8(%esp), %eax	/* Second argument is return value.  */
+	/* Save the return address now.  */
+	movl (JB_PC*4)(%ecx), %edx
+	LIBC_PROBE (longjmp, 3, 4@%ecx, -4@%eax, 4@%edx)
+	/* Restore registers.  */
+	movl (JB_BX*4)(%ecx), %ebx
+	movl (JB_SI*4)(%ecx), %esi
+	movl (JB_DI*4)(%ecx), %edi
+	movl (JB_BP*4)(%ecx), %ebp
+	movl (JB_SP*4)(%ecx), %esp
+	LIBC_PROBE (longjmp_target, 3, 4@%ecx, -4@%ecx, 4@%edx)
+#endif
+	/* Jump to saved PC.  */
+	jmp *%edx
+END (__longjmp)
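
For orientation, __longjmp is the back half of the standard interface: the assembly reloads the callee-saved registers, the stack pointer, and the saved PC from the jmp_buf, leaving the second argument in %eax as the value setjmp appears to return.  A minimal C sketch of that contract:

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf env;

    int
    main (void)
    {
      int rc = setjmp (env);	/* 0 on the initial call */
      if (rc == 0)
        longjmp (env, 42);	/* control re-enters setjmp above */
      printf ("setjmp returned %d\n", rc);	/* prints 42 */
      return 0;
    }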
diff --git a/REORG.TODO/sysdeps/i386/abort-instr.h b/REORG.TODO/sysdeps/i386/abort-instr.h
new file mode 100644
index 0000000000..810f10379b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/abort-instr.h
@@ -0,0 +1,2 @@
+/* An instruction which should crash any program is `hlt'.  */
+#define ABORT_INSTRUCTION asm ("hlt")
diff --git a/REORG.TODO/sysdeps/i386/add_n.S b/REORG.TODO/sysdeps/i386/add_n.S
new file mode 100644
index 0000000000..c2923094a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/add_n.S
@@ -0,0 +1,111 @@
+/* Add two limb vectors of the same length > 0 and store sum in a third
+   limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+8	/* space for 2 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+ENTRY (__mpn_add_n)
+
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	movl RES(%esp),%edi
+	cfi_rel_offset (edi, 4)
+	movl S1(%esp),%esi
+	cfi_rel_offset (esi, 0)
+	movl S2(%esp),%edx
+	movl SIZE(%esp),%ecx
+	movl	%ecx,%eax
+	shrl	$3,%ecx			/* compute count for unrolled loop */
+	negl	%eax
+	andl	$7,%eax			/* get index where to start loop */
+	jz	L(oop)			/* necessary special case for 0 */
+	incl	%ecx			/* adjust loop count */
+	shll	$2,%eax			/* adjustment for pointers... */
+	subl	%eax,%edi		/* ... since they are offset ... */
+	subl	%eax,%esi		/* ... by a constant when we ... */
+	subl	%eax,%edx		/* ... enter the loop */
+	shrl	$2,%eax			/* restore previous value */
+#ifdef PIC
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, L(oop)-L(0)-3 cannot be put into the leal */
+	call	L(0)
+	cfi_adjust_cfa_offset (4)
+L(0):	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$(L(oop)-L(0)-3),%eax
+	addl	$4,%esp
+	cfi_adjust_cfa_offset (-4)
+#else
+/* Calculate start address in loop for non-PIC.  */
+ 	leal	(L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+	jmp	*%eax			/* jump into loop */
+	ALIGN (3)
+L(oop):	movl	(%esi),%eax
+	adcl	(%edx),%eax
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	adcl	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	adcl	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	adcl	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	adcl	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	adcl	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	adcl	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	adcl	28(%edx),%eax
+	movl	%eax,28(%edi)
+	leal	32(%edi),%edi
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx
+	jnz	L(oop)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_add_n)
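
The unrolled loop above computes, eight limbs at a time, what the following hedged C sketch does one limb at a time (illustrative names; the real limb type comes from the gmp headers):

    #include <stdint.h>

    typedef uint32_t limb_t;	/* stand-in for mp_limb_t on i386 */

    /* res[i] = s1[i] + s2[i] with the carry propagated across all
       `size' limbs; the final carry (0 or 1) is the return value,
       which the assembly materializes with the sbbl/negl pair.  */
    limb_t
    mpn_add_n_ref (limb_t *res, const limb_t *s1, const limb_t *s2,
                   long size)
    {
      limb_t cy = 0;
      for (long i = 0; i < size; i++)
        {
          limb_t a = s1[i];
          limb_t sum = a + s2[i] + cy;
          cy = cy ? (sum <= a) : (sum < a);	/* carry out of 32 bits */
          res[i] = sum;
        }
      return cy;
    }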
diff --git a/REORG.TODO/sysdeps/i386/addmul_1.S b/REORG.TODO/sysdeps/i386/addmul_1.S
new file mode 100644
index 0000000000..ad90ea53e5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/addmul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+   the result to a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define sizeP ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_addmul_1)
+
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %sizeP
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%sizeP,4), %res_ptr
+	leal	(%s1_ptr,%sizeP,4), %s1_ptr
+	negl	%sizeP
+	xorl	%ebp, %ebp
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+L(oop):
+	movl	(%s1_ptr,%sizeP,4), %eax
+	mull	%s2_limb
+	addl	%ebp, %eax
+	adcl	$0, %edx
+	addl	%eax, (%res_ptr,%sizeP,4)
+	adcl	$0, %edx
+	movl	%edx, %ebp
+
+	incl	%sizeP
+	jnz	L(oop)
+	movl	%ebp, %eax
+
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+END (__mpn_addmul_1)
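
The mull/adcl sequence above is a textbook multiply-accumulate inner loop.  As a hedged C sketch of the same computation, with a 64-bit intermediate standing in for the %edx:%eax register pair:

    #include <stdint.h>

    /* res[i] += s1[i] * s2_limb, with the high half of each product
       carried into the next limb; the final carry limb (kept in %ebp
       above) is returned.  */
    uint32_t
    mpn_addmul_1_ref (uint32_t *res, const uint32_t *s1, long size,
                      uint32_t s2_limb)
    {
      uint32_t cy = 0;
      for (long i = 0; i < size; i++)
        {
          uint64_t t = (uint64_t) s1[i] * s2_limb + res[i] + cy;
          res[i] = (uint32_t) t;	/* low half back to memory */
          cy = (uint32_t) (t >> 32);	/* high half is the next carry */
        }
      return cy;
    }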
diff --git a/REORG.TODO/sysdeps/i386/asm-syntax.h b/REORG.TODO/sysdeps/i386/asm-syntax.h
new file mode 100644
index 0000000000..a992da2dd1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/asm-syntax.h
@@ -0,0 +1,24 @@
+/* Definitions for x86 syntax variations.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.  Its master source is NOT part of
+   the C library, however.  The master source lives in the GNU MP Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#undef ALIGN
+#define ALIGN(log) .align 1<<log
+
+#undef L
+#define L(body) .L##body
diff --git a/REORG.TODO/sysdeps/i386/atomic-machine.h b/REORG.TODO/sysdeps/i386/atomic-machine.h
new file mode 100644
index 0000000000..0e24200617
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/atomic-machine.h
@@ -0,0 +1,545 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <tls.h>	/* For tcbhead_t.  */
+
+
+typedef int8_t atomic8_t;
+typedef uint8_t uatomic8_t;
+typedef int_fast8_t atomic_fast8_t;
+typedef uint_fast8_t uatomic_fast8_t;
+
+typedef int16_t atomic16_t;
+typedef uint16_t uatomic16_t;
+typedef int_fast16_t atomic_fast16_t;
+typedef uint_fast16_t uatomic_fast16_t;
+
+typedef int32_t atomic32_t;
+typedef uint32_t uatomic32_t;
+typedef int_fast32_t atomic_fast32_t;
+typedef uint_fast32_t uatomic_fast32_t;
+
+typedef int64_t atomic64_t;
+typedef uint64_t uatomic64_t;
+typedef int_fast64_t atomic_fast64_t;
+typedef uint_fast64_t uatomic_fast64_t;
+
+typedef intptr_t atomicptr_t;
+typedef uintptr_t uatomicptr_t;
+typedef intmax_t atomic_max_t;
+typedef uintmax_t uatomic_max_t;
+
+
+#ifndef LOCK_PREFIX
+# ifdef UP
+#  define LOCK_PREFIX	/* nothing */
+# else
+#  define LOCK_PREFIX "lock;"
+# endif
+#endif
+
+#define __HAVE_64B_ATOMICS 0
+#define USE_ATOMIC_COMPILER_BUILTINS 0
+#define ATOMIC_EXCHANGE_USES_CAS 0
+
+
+#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
+  __sync_val_compare_and_swap (mem, oldval, newval)
+#define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
+  (! __sync_bool_compare_and_swap (mem, oldval, newval))
+
+
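A usage sketch for the macros just defined (illustrative only, assuming this header is in scope): the _val form returns the previous contents of the memory word, so a caller detects success by comparing against the expected old value.  For example, a trivial spin flag:

    /* Sketch: acquire a flag word with the CAS macro above.  The macro
       expands to __sync_val_compare_and_swap, which returns the value
       the memory held before the operation; 0 means we installed the 1
       and own the flag.  */
    static volatile int lock_word;	/* hypothetical shared flag */

    static void
    spin_acquire (void)
    {
      while (atomic_compare_and_exchange_val_acq (&lock_word, 1, 0) != 0)
        ;	/* another thread holds the flag; retry */
    }
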
+#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("cmpl $0, %%gs:%P5\n\t"                                \
+                       "je 0f\n\t"                                            \
+                       "lock\n"                                               \
+                       "0:\tcmpxchgb %b2, %1"				      \
+		       : "=a" (ret), "=m" (*mem)			      \
+		       : "q" (newval), "m" (*mem), "0" (oldval),	      \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+
+#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("cmpl $0, %%gs:%P5\n\t"                                \
+                       "je 0f\n\t"                                            \
+                       "lock\n"                                               \
+                       "0:\tcmpxchgw %w2, %1"				      \
+		       : "=a" (ret), "=m" (*mem)			      \
+		       : "r" (newval), "m" (*mem), "0" (oldval),	      \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+
+#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("cmpl $0, %%gs:%P5\n\t"                                \
+                       "je 0f\n\t"                                            \
+                       "lock\n"                                               \
+                       "0:\tcmpxchgl %2, %1"				      \
+		       : "=a" (ret), "=m" (*mem)			      \
+		       : "r" (newval), "m" (*mem), "0" (oldval),	      \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+
+/* XXX We do not really need 64-bit compare-and-exchange.  At least
+   not at the moment.  Using it would mean causing portability
+   problems since not many other 32-bit architectures have support for
+   such an operation.  So don't define any code for now.  If it is
+   really going to be used the code below can be used on Intel Pentium
+   and later, but NOT on i486.  */
+#if 1
+# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
+  ({ __typeof (*mem) ret = *(mem);					      \
+     abort ();								      \
+     ret = (newval);							      \
+     ret = (oldval);							      \
+     ret; })
+# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
+  ({ __typeof (*mem) ret = *(mem);					      \
+     abort ();								      \
+     ret = (newval);							      \
+     ret = (oldval);							      \
+     ret; })
+#else
+# ifdef __PIC__
+#  define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("xchgl %2, %%ebx\n\t"				      \
+		       LOCK_PREFIX "cmpxchg8b %1\n\t"			      \
+		       "xchgl %2, %%ebx"				      \
+		       : "=A" (ret), "=m" (*mem)			      \
+		       : "DS" (((unsigned long long int) (newval))	      \
+			       & 0xffffffff),				      \
+			 "c" (((unsigned long long int) (newval)) >> 32),     \
+			 "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+					  & 0xffffffff),		      \
+			 "d" (((unsigned long long int) (oldval)) >> 32));    \
+     ret; })
+
+#  define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("xchgl %2, %%ebx\n\t"				      \
+		       "cmpl $0, %%gs:%P7\n\t"				      \
+		       "je 0f\n\t"					      \
+		       "lock\n"						      \
+		       "0:\tcmpxchg8b %1\n\t"				      \
+		       "xchgl %2, %%ebx"				      \
+		       : "=A" (ret), "=m" (*mem)			      \
+		       : "DS" (((unsigned long long int) (newval))	      \
+			       & 0xffffffff),				      \
+			 "c" (((unsigned long long int) (newval)) >> 32),     \
+			 "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+					  & 0xffffffff),		      \
+			 "d" (((unsigned long long int) (oldval)) >> 32),     \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+# else
+#  define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile (LOCK_PREFIX "cmpxchg8b %1"			      \
+		       : "=A" (ret), "=m" (*mem)			      \
+		       : "b" (((unsigned long long int) (newval))	      \
+			      & 0xffffffff),				      \
+			 "c" (((unsigned long long int) (newval)) >> 32),     \
+			 "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+					  & 0xffffffff),		      \
+			 "d" (((unsigned long long int) (oldval)) >> 32));    \
+     ret; })
+
+#  define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("cmpl $0, %%gs:%P7\n\t"				      \
+		       "je 0f\n\t"					      \
+		       "lock\n"						      \
+		       "0:\tcmpxchg8b %1"				      \
+		       : "=A" (ret), "=m" (*mem)			      \
+		       : "b" (((unsigned long long int) (newval))	      \
+			      & 0xffffffff),				      \
+			 "c" (((unsigned long long int) (newval)) >> 32),     \
+			 "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+					  & 0xffffffff),		      \
+			 "d" (((unsigned long long int) (oldval)) >> 32),     \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+# endif
+#endif
+
+
+/* Note that we need no lock prefix: xchg with a memory operand is
+   implicitly locked.  */
+#define atomic_exchange_acq(mem, newvalue) \
+  ({ __typeof (*mem) result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile ("xchgb %b0, %1"				      \
+			 : "=q" (result), "=m" (*mem)			      \
+			 : "0" (newvalue), "m" (*mem));			      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile ("xchgw %w0, %1"				      \
+			 : "=r" (result), "=m" (*mem)			      \
+			 : "0" (newvalue), "m" (*mem));			      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile ("xchgl %0, %1"					      \
+			 : "=r" (result), "=m" (*mem)			      \
+			 : "0" (newvalue), "m" (*mem));			      \
+     else								      \
+       {								      \
+	 result = 0;							      \
+	 abort ();							      \
+       }								      \
+     result; })
+
+
+#define __arch_exchange_and_add_body(lock, pfx, mem, value) \
+  ({ __typeof (*mem) __result;						      \
+     __typeof (value) __addval = (value);				      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (lock "xaddb %b0, %1"				      \
+			 : "=q" (__result), "=m" (*mem)			      \
+			 : "0" (__addval), "m" (*mem),			      \
+			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (lock "xaddw %w0, %1"				      \
+			 : "=r" (__result), "=m" (*mem)			      \
+			 : "0" (__addval), "m" (*mem),			      \
+			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (lock "xaddl %0, %1"				      \
+			 : "=r" (__result), "=m" (*mem)			      \
+			 : "0" (__addval), "m" (*mem),			      \
+			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+     else								      \
+       {								      \
+	 __typeof (mem) __memp = (mem);					      \
+	 __typeof (*mem) __tmpval;					      \
+	 __result = *__memp;						      \
+	 do								      \
+	   __tmpval = __result;						      \
+	 while ((__result = pfx##_compare_and_exchange_val_64_acq	      \
+		 (__memp, __result + __addval, __result)) == __tmpval);	      \
+       }								      \
+     __result; })
+
+#define atomic_exchange_and_add(mem, value) \
+  __sync_fetch_and_add (mem, value)
+
+#define __arch_exchange_and_add_cprefix \
+  "cmpl $0, %%gs:%P4\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_exchange_and_add(mem, value) \
+  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
+				mem, value)
+
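The cmpl against %gs:multiple_threads in the `c' (conditional) variants skips the lock prefix while the process is still single-threaded, where the bus lock would be pure overhead.  Morally the macros do the following, rendered as a hedged C sketch with a plain variable standing in for the tcbhead_t field:

    /* Illustration only; `multiple_threads_flag' stands in for the TCB
       flag the real code reads through %gs.  */
    extern int multiple_threads_flag;	/* hypothetical stand-in */

    static int
    catomic_fetch_add_sketch (int *mem, int val)
    {
      if (!multiple_threads_flag)
        {
          int old = *mem;	/* no other thread can race with us */
          *mem = old + val;
          return old;
        }
      return __sync_fetch_and_add (mem, val);	/* locked xadd */
    }
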
+
+#define __arch_add_body(lock, pfx, mem, value) \
+  do {									      \
+    if (__builtin_constant_p (value) && (value) == 1)			      \
+      atomic_increment (mem);						      \
+    else if (__builtin_constant_p (value) && (value) == -1)		      \
+      atomic_decrement (mem);						      \
+    else if (sizeof (*mem) == 1)					      \
+      __asm __volatile (lock "addb %b1, %0"				      \
+			: "=m" (*mem)					      \
+			: "iq" (value), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "addw %w1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (value), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "addl %1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (value), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      {									      \
+	__typeof (value) __addval = (value);				      \
+	__typeof (mem) __memp = (mem);					      \
+	__typeof (*mem) __oldval = *__memp;				      \
+	__typeof (*mem) __tmpval;					      \
+	do								      \
+	  __tmpval = __oldval;						      \
+	while ((__oldval = pfx##_compare_and_exchange_val_64_acq	      \
+		(__memp, __oldval + __addval, __oldval)) == __tmpval);	      \
+      }									      \
+  } while (0)
+
+#define atomic_add(mem, value) \
+  __arch_add_body (LOCK_PREFIX, __arch, mem, value)
+
+#define __arch_add_cprefix \
+  "cmpl $0, %%gs:%P3\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_add(mem, value) \
+  __arch_add_body (__arch_add_cprefix, __arch_c, mem, value)
+
+
+#define atomic_add_negative(mem, value) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "iq" (value), "m" (*mem));			      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "ir" (value), "m" (*mem));			      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "ir" (value), "m" (*mem));			      \
+     else								      \
+       abort ();							      \
+     __result; })
+
+
+#define atomic_add_zero(mem, value) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "iq" (value), "m" (*mem));			      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "ir" (value), "m" (*mem));			      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "ir" (value), "m" (*mem));			      \
+     else								      \
+       abort ();							      \
+     __result; })
+
+
+#define __arch_increment_body(lock,  pfx, mem) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (lock "incb %b0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "incw %w0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "incl %0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      {									      \
+	__typeof (mem) __memp = (mem);					      \
+	__typeof (*mem) __oldval = *__memp;				      \
+	__typeof (*mem) __tmpval;					      \
+	do								      \
+	  __tmpval = __oldval;						      \
+	while ((__oldval = pfx##_compare_and_exchange_val_64_acq	      \
+		(__memp, __oldval + 1, __oldval)) == __tmpval);		      \
+      }									      \
+  } while (0)
+
+#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
+
+#define __arch_increment_cprefix \
+  "cmpl $0, %%gs:%P2\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_increment(mem) \
+  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
+
+
+#define atomic_increment_and_test(mem) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "incb %0; sete %b1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "incw %0; sete %w1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "incl %0; sete %1"			      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else								      \
+       abort ();							      \
+     __result; })
+
+
+#define __arch_decrement_body(lock, pfx, mem) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (lock "decb %b0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "decw %w0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "decl %0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      {									      \
+	__typeof (mem) __memp = (mem);					      \
+	__typeof (*mem) __oldval = *__memp;				      \
+	__typeof (*mem) __tmpval;					      \
+	do								      \
+	  __tmpval = __oldval;						      \
+	while ((__oldval = pfx##_compare_and_exchange_val_64_acq	      \
+		(__memp, __oldval - 1, __oldval)) == __tmpval); 	      \
+      }									      \
+  } while (0)
+
+#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
+
+#define __arch_decrement_cprefix \
+  "cmpl $0, %%gs:%P2\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_decrement(mem) \
+  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
+
+
+#define atomic_decrement_and_test(mem) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "decb %b0; sete %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "decw %w0; sete %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "decl %0; sete %1"			      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else								      \
+       abort ();							      \
+     __result; })
+
+
+#define atomic_bit_set(mem, bit) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (LOCK_PREFIX "orb %b2, %0"			      \
+			: "=m" (*mem)					      \
+			: "m" (*mem), "iq" (1 << (bit)));		      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (LOCK_PREFIX "orw %w2, %0"			      \
+			: "=m" (*mem)					      \
+			: "m" (*mem), "ir" (1 << (bit)));		      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (LOCK_PREFIX "orl %2, %0"			      \
+			: "=m" (*mem)					      \
+			: "m" (*mem), "ir" (1 << (bit)));		      \
+    else								      \
+      abort ();								      \
+  } while (0)
+
+
+#define atomic_bit_test_set(mem, bit) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0"		      \
+			 : "=q" (__result), "=m" (*mem)			      \
+			 : "m" (*mem), "ir" (bit));			      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0"		      \
+			 : "=q" (__result), "=m" (*mem)			      \
+			 : "m" (*mem), "ir" (bit));			      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0"		      \
+			 : "=q" (__result), "=m" (*mem)			      \
+			 : "m" (*mem), "ir" (bit));			      \
+     else							      	      \
+       abort ();							      \
+     __result; })
+
+
+#define atomic_spin_nop() asm ("rep; nop")
+
+
+#define __arch_and_body(lock, mem, mask) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (lock "andb %b1, %0"				      \
+			: "=m" (*mem)					      \
+			: "iq" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "andw %w1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "andl %1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      abort ();								      \
+  } while (0)
+
+#define __arch_cprefix \
+  "cmpl $0, %%gs:%P3\n\tje 0f\n\tlock\n0:\t"
+
+#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
+
+#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
+
+
+#define __arch_or_body(lock, mem, mask) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (lock "orb %b1, %0"				      \
+			: "=m" (*mem)					      \
+			: "iq" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "orw %w1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "orl %1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      abort ();								      \
+  } while (0)
+
+#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
+
+#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
+
+/* We don't use mfence because it is supposedly slower due to having to
+   provide stronger guarantees (e.g., regarding self-modifying code).  */
+#define atomic_full_barrier() \
+    __asm __volatile (LOCK_PREFIX "orl $0, (%%esp)" ::: "memory")
+#define atomic_read_barrier() __asm ("" ::: "memory")
+#define atomic_write_barrier() __asm ("" ::: "memory")
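
The macros above select one locked instruction per operand size at compile time and read the resulting flag back with a setcc instruction. A minimal sketch of the same decrement-and-test idea, written against the GCC __atomic builtins rather than these glibc-internal macros (all names here are illustrative):

#include <stdio.h>

static int refcount = 3;

/* Same observable effect as atomic_decrement_and_test: a locked
   decrement whose "reached zero" outcome is returned to the caller.  */
static int
decrement_and_test (int *mem)
{
  return __atomic_sub_fetch (mem, 1, __ATOMIC_ACQ_REL) == 0;
}

int
main (void)
{
  while (!decrement_and_test (&refcount))
    puts ("still referenced");
  puts ("last reference dropped");
  return 0;
}
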
diff --git a/REORG.TODO/sysdeps/i386/backtrace.c b/REORG.TODO/sysdeps/i386/backtrace.c
new file mode 100644
index 0000000000..ee8238d0ce
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/backtrace.c
@@ -0,0 +1,163 @@
+/* Return backtrace of current program state.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <libc-lock.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <unwind.h>
+
+struct trace_arg
+{
+  void **array;
+  int cnt, size;
+  void *lastebp, *lastesp;
+};
+
+#ifdef SHARED
+static _Unwind_Reason_Code (*unwind_backtrace) (_Unwind_Trace_Fn, void *);
+static _Unwind_Ptr (*unwind_getip) (struct _Unwind_Context *);
+static _Unwind_Ptr (*unwind_getcfa) (struct _Unwind_Context *);
+static _Unwind_Ptr (*unwind_getgr) (struct _Unwind_Context *, int);
+static void *libgcc_handle;
+
+static void
+init (void)
+{
+  libgcc_handle = __libc_dlopen ("libgcc_s.so.1");
+
+  if (libgcc_handle == NULL)
+    return;
+
+  unwind_backtrace = __libc_dlsym (libgcc_handle, "_Unwind_Backtrace");
+  unwind_getip = __libc_dlsym (libgcc_handle, "_Unwind_GetIP");
+  unwind_getcfa = __libc_dlsym (libgcc_handle, "_Unwind_GetCFA");
+  unwind_getgr = __libc_dlsym (libgcc_handle, "_Unwind_GetGR");
+  if (unwind_getip == NULL || unwind_getgr == NULL || unwind_getcfa == NULL)
+    {
+      unwind_backtrace = NULL;
+      __libc_dlclose (libgcc_handle);
+      libgcc_handle = NULL;
+    }
+}
+#else
+# define unwind_backtrace _Unwind_Backtrace
+# define unwind_getip _Unwind_GetIP
+# define unwind_getcfa _Unwind_GetCFA
+# define unwind_getgr _Unwind_GetGR
+#endif
+
+static _Unwind_Reason_Code
+backtrace_helper (struct _Unwind_Context *ctx, void *a)
+{
+  struct trace_arg *arg = a;
+
+  /* We are first called with the address in the __backtrace function.
+     Skip it.  */
+  if (arg->cnt != -1)
+    arg->array[arg->cnt] = (void *) unwind_getip (ctx);
+  if (++arg->cnt == arg->size)
+    return _URC_END_OF_STACK;
+
+  /* %ebp is DWARF2 register 5 on IA-32.  */
+  arg->lastebp = (void *) unwind_getgr (ctx, 5);
+  arg->lastesp = (void *) unwind_getcfa (ctx);
+  return _URC_NO_REASON;
+}
+
+
+/* This is a global variable set at program start time.  It marks the
+   highest used stack address.  */
+extern void *__libc_stack_end;
+
+
+/* This is the stack layout we see with every stack frame,
+   unless the code was compiled without a frame pointer.
+
+            +-----------------+        +-----------------+
+    %ebp -> | %ebp last frame--------> | %ebp last frame--->...
+            |                 |        |                 |
+            | return address  |        | return address  |
+            +-----------------+        +-----------------+
+
+   First try to get as far as possible using _Unwind_Backtrace,
+   which handles -fomit-frame-pointer as well but requires
+   .eh_frame info.  Then fall back to walking the stack
+   manually.  */
+
+struct layout
+{
+  struct layout *ebp;
+  void *ret;
+};
+
+
+int
+__backtrace (void **array, int size)
+{
+  struct trace_arg arg = { .array = array, .size = size, .cnt = -1 };
+
+  if (size <= 0)
+    return 0;
+
+#ifdef SHARED
+  __libc_once_define (static, once);
+
+  __libc_once (once, init);
+  if (unwind_backtrace == NULL)
+    return 0;
+#endif
+
+  unwind_backtrace (backtrace_helper, &arg);
+
+  if (arg.cnt > 1 && arg.array[arg.cnt - 1] == NULL)
+    --arg.cnt;
+  else if (arg.cnt < size)
+    {
+      struct layout *ebp = (struct layout *) arg.lastebp;
+
+      while (arg.cnt < size)
+	{
+	  /* Check for out of range.  */
+	  if ((void *) ebp < arg.lastesp || (void *) ebp > __libc_stack_end
+	      || ((long) ebp & 3))
+	    break;
+
+	  array[arg.cnt++] = ebp->ret;
+	  ebp = ebp->ebp;
+	}
+    }
+  return arg.cnt != -1 ? arg.cnt : 0;
+}
+weak_alias (__backtrace, backtrace)
+libc_hidden_def (__backtrace)
+
+
+#ifdef SHARED
+/* Free all resources if necessary.  */
+libc_freeres_fn (free_mem)
+{
+  unwind_backtrace = NULL;
+  if (libgcc_handle != NULL)
+    {
+      __libc_dlclose (libgcc_handle);
+      libgcc_handle = NULL;
+    }
+}
+#endif
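
The manual fallback above walks the saved %ebp chain through struct layout. A stripped-down sketch of that walk as a standalone program, assuming frame pointers are in use (compile with -fno-omit-frame-pointer); the bounds checks against lastesp and __libc_stack_end are omitted here:

#include <stdio.h>

struct layout
{
  struct layout *ebp;   /* saved caller frame pointer */
  void *ret;            /* return address pushed by the call */
};

static int
walk_frames (void **array, int size)
{
  struct layout *frame = __builtin_frame_address (0);
  int cnt = 0;

  /* The chain ends at a NULL frame pointer; glibc's startup code
     arranges for that by clearing %ebp.  */
  while (frame != NULL && cnt < size)
    {
      array[cnt++] = frame->ret;
      frame = frame->ebp;
    }
  return cnt;
}

int
main (void)
{
  void *buf[8];
  int n = walk_frames (buf, 8);
  for (int i = 0; i < n; ++i)
    printf ("frame %d: %p\n", i, buf[i]);
  return 0;
}
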
diff --git a/REORG.TODO/sysdeps/i386/bcopy.S b/REORG.TODO/sysdeps/i386/bcopy.S
new file mode 100644
index 0000000000..12b8ddb886
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bcopy.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		bcopy
+#include "memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/bsd-_setjmp.S b/REORG.TODO/sysdeps/i386/bsd-_setjmp.S
new file mode 100644
index 0000000000..6496304946
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bsd-_setjmp.S
@@ -0,0 +1,56 @@
+/* BSD `_setjmp' entry point to `sigsetjmp (..., 0)'.  i386 version.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This just does a tail-call to `__sigsetjmp (ARG, 0)'.
+   We cannot do it in C because it must be a tail-call, so frame-unwinding
+   in setjmp doesn't clobber the state restored by longjmp.  */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <stap-probe.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define JMPBUF	PARMS
+#define SIGMSK	JMPBUF+4
+
+ENTRY (_setjmp)
+
+	xorl %eax, %eax
+	movl JMPBUF(%esp), %edx
+
+	/* Save registers.  */
+	movl %ebx, (JB_BX*4)(%edx)
+	movl %esi, (JB_SI*4)(%edx)
+	movl %edi, (JB_DI*4)(%edx)
+	leal JMPBUF(%esp), %ecx	/* Save SP as it will be after we return.  */
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_SP*4)(%edx)
+	movl 0(%esp), %ecx	/* Save PC we are returning to now.  */
+	LIBC_PROBE (setjmp, 3, 4@%edx, -4@$0, 4@%ecx)
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_PC*4)(%edx)
+	movl %ebp, (JB_BP*4)(%edx) /* Save caller's frame pointer.  */
+
+	movl %eax, JB_SIZE(%edx) /* No signal mask set.  */
+	ret
+END (_setjmp)
+libc_hidden_def (_setjmp)
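
As the header comment says, _setjmp is the no-signal-mask entry point, i.e. sigsetjmp with a zero flag, which is why the JB_SIZE field is set to zero above. A short usage sketch:

#include <setjmp.h>
#include <stdio.h>

static jmp_buf env;

static void
fail (void)
{
  _longjmp (env, 42);          /* unwinds back to the _setjmp below */
}

int
main (void)
{
  if (_setjmp (env) == 0)      /* returns 0 on the direct call */
    fail ();
  puts ("resumed after _longjmp, signal mask untouched");
  return 0;
}
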
diff --git a/REORG.TODO/sysdeps/i386/bsd-setjmp.S b/REORG.TODO/sysdeps/i386/bsd-setjmp.S
new file mode 100644
index 0000000000..5710e1f42b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bsd-setjmp.S
@@ -0,0 +1,66 @@
+/* BSD `setjmp' entry point to `sigsetjmp (..., 1)'.  i386 version.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This just does a tail-call to `__sigsetjmp (ARG, 1)'.
+   We cannot do it in C because it must be a tail-call, so frame-unwinding
+   in setjmp doesn't clobber the state restored by longjmp.  */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <stap-probe.h>
+
+#define PARMS  4		/* no space for saved regs */
+#define JMPBUF PARMS
+#define SIGMSK JMPBUF+4
+
+ENTRY (setjmp)
+	/* Note that we have to use a non-exported symbol in the next
+	   jump since otherwise gas will emit it as a jump through the
+	   PLT, which we cannot use here.  */
+
+	movl JMPBUF(%esp), %eax
+
+	/* Save registers.  */
+	movl %ebx, (JB_BX*4)(%eax)
+	movl %esi, (JB_SI*4)(%eax)
+	movl %edi, (JB_DI*4)(%eax)
+	leal JMPBUF(%esp), %ecx	/* Save SP as it will be after we return.  */
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_SP*4)(%eax)
+	movl 0(%esp), %ecx	/* Save PC we are returning to now.  */
+	LIBC_PROBE (setjmp, 3, 4@%eax, -4@$1, 4@%ecx)
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_PC*4)(%eax)
+	movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer.  */
+
+	/* Call __sigjmp_save.  */
+	pushl $1
+	cfi_adjust_cfa_offset (4)
+	pushl 8(%esp)
+	cfi_adjust_cfa_offset (4)
+	call __sigjmp_save
+	popl %ecx
+	cfi_adjust_cfa_offset (-4)
+	popl %edx
+	cfi_adjust_cfa_offset (-4)
+	ret
+END (setjmp)
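
The pushl $1 above is the savemask flag handed to __sigjmp_save, making BSD setjmp equivalent to sigsetjmp (env, 1). A small program showing what that flag buys, namely restoration of the signal mask on the jump back:

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf env;

int
main (void)
{
  if (sigsetjmp (env, 1) == 0)   /* nonzero flag: save the signal mask */
    {
      sigset_t set;
      sigemptyset (&set);
      sigaddset (&set, SIGUSR1);
      sigprocmask (SIG_BLOCK, &set, NULL);  /* change the mask...  */
      siglongjmp (env, 1);                  /* ...the jump undoes it */
    }
  sigset_t cur;
  sigprocmask (SIG_BLOCK, NULL, &cur);
  printf ("SIGUSR1 still blocked: %d\n", sigismember (&cur, SIGUSR1));
  return 0;
}
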
diff --git a/REORG.TODO/sysdeps/i386/bzero.S b/REORG.TODO/sysdeps/i386/bzero.S
new file mode 100644
index 0000000000..c8dd47b4da
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bzero.S
@@ -0,0 +1,5 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include "memset.S"
+
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/cacheinfo.c b/REORG.TODO/sysdeps/i386/cacheinfo.c
new file mode 100644
index 0000000000..f15fe0779a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/cacheinfo.c
@@ -0,0 +1,3 @@
+#define DISABLE_PREFETCHW
+
+#include <sysdeps/x86/cacheinfo.c>
diff --git a/REORG.TODO/sysdeps/i386/configure b/REORG.TODO/sysdeps/i386/configure
new file mode 100644
index 0000000000..5b55c5affe
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/configure
@@ -0,0 +1,84 @@
+# This file is generated from configure.ac by Autoconf.  DO NOT EDIT!
+ # Local configure fragment for sysdeps/i386.
+
+# We no longer support i386 since it lacks the atomic instructions
+# required to implement NPTL threading.
+if test "$config_machine" = i386; then
+  as_fn_error $? "
+*** ERROR: Support for i386 is deprecated.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /src/glibc/configure --host=i686-pc-linux-gnu ..." "$LINENO" 5
+fi
+
+# The GNU C Library can't be built for i386.  There are several reasons for
+# this restriction.  The primary reason is that i386 lacks the atomic
+# operations required to support the current NPTL implementation.  While it
+# is possible that such atomic operations could be emulated in the kernel,
+# to date no such work has been done to enable this.  Even with NPTL
+# disabled there is still no atomic.h implementation.  Given the declining
+# use of i386 we disable support for building with `-march=i386' or
+# `-mcpu=i386'.  We don't explicitly check for i386; instead we make sure
+# the compiler can inline the builtin __sync_val_compare_and_swap.  If it
+# can, the configured CPU has the required atomics and the build can proceed.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for compiler support of inlined builtin function __sync_val_compare_and_swap" >&5
+$as_echo_n "checking for compiler support of inlined builtin function __sync_val_compare_and_swap... " >&6; }
+libc_compiler_builtin_inlined=no
+cat > conftest.c <<EOF
+int _start (void) { int a, b, c; __sync_val_compare_and_swap (&a, b, c); return 0; }
+EOF
+if ! { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS
+		     -O0 -nostdlib -nostartfiles
+		     -S conftest.c -o - | fgrep "__sync_val_compare_and_swap"
+		     1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then
+  libc_compiler_builtin_inlined=yes
+fi
+rm -f conftest*
+if test $libc_compiler_builtin_inlined = yes; then
+  libc_cv_unsupported_i386=no
+else
+  as_fn_error $? "
+*** Building with -march=i386/-mcpu=i386 is not supported.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /source/glibc/configure CFLAGS='-O2 -march=i686' ..." "$LINENO" 5
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_compiler_builtin_inlined" >&5
+$as_echo "$libc_compiler_builtin_inlined" >&6; }
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
+$as_echo_n "checking for Intel MPX support... " >&6; }
+if ${libc_cv_asm_mpx+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5
+$as_echo "$libc_cv_asm_mpx" >&6; }
+if test $libc_cv_asm_mpx = yes; then
+  $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h
+
+fi
+
+$as_echo "#define USE_REGPARMS 1" >>confdefs.h
+
+
+$as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
+
diff --git a/REORG.TODO/sysdeps/i386/configure.ac b/REORG.TODO/sysdeps/i386/configure.ac
new file mode 100644
index 0000000000..19ef33f34a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/configure.ac
@@ -0,0 +1,52 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/i386.
+
+# We no longer support i386 since it lacks the atomic instructions
+# required to implement NPTL threading.
+if test "$config_machine" = i386; then
+  AC_MSG_ERROR([
+*** ERROR: Support for i386 is deprecated.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /src/glibc/configure --host=i686-pc-linux-gnu ...])
+fi
+
+# The GNU C Library can't be built for i386.  There are several reasons for
+# this restriction.  The primary reason is that i386 lacks the atomic
+# operations required to support the current NPTL implementation.  While it
+# is possible that such atomic operations could be emulated in the kernel,
+# to date no such work has been done to enable this.  Even with NPTL
+# disabled there is still no atomic.h implementation.  Given the declining
+# use of i386 we disable support for building with `-march=i386' or
+# `-mcpu=i386'.  We don't explicitly check for i386; instead we make sure
+# the compiler can inline the builtin __sync_val_compare_and_swap.  If it
+# can, the configured CPU has the required atomics and the build can proceed.
+LIBC_COMPILER_BUILTIN_INLINED(
+  [__sync_val_compare_and_swap],
+  [int a, b, c; __sync_val_compare_and_swap (&a, b, c);],
+  [-O0],
+  [libc_cv_unsupported_i386=no],
+  [AC_MSG_ERROR([
+*** Building with -march=i386/-mcpu=i386 is not supported.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /source/glibc/configure CFLAGS='-O2 -march=i686' ...])])
+
+dnl Check whether asm supports Intel MPX
+AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
+cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*])
+if test $libc_cv_asm_mpx = yes; then
+  AC_DEFINE(HAVE_MPX_SUPPORT)
+fi
+
+AC_DEFINE(USE_REGPARMS)
+
+dnl It is always possible to access static and hidden symbols in an
+dnl position independent way.
+AC_DEFINE(PI_STATIC_AND_HIDDEN)
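
The probe above only has to see whether the builtin is emitted inline or as an out-of-line libgcc call; on i486 and later it compiles to a single lock cmpxchg. A standalone illustration of the builtin being tested:

#include <stdio.h>

int
main (void)
{
  int word = 0;
  /* Returns the old value; stores 1 only if word still equals 0.  */
  int old = __sync_val_compare_and_swap (&word, 0, 1);
  printf ("old=%d new=%d\n", old, word);   /* old=0 new=1 */
  return 0;
}
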
diff --git a/REORG.TODO/sysdeps/i386/crti.S b/REORG.TODO/sysdeps/i386/crti.S
new file mode 100644
index 0000000000..f800209990
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/crti.S
@@ -0,0 +1,84 @@
+/* Special .init and .fini section support for x86.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file with other
+   programs, and to distribute those programs without any restriction
+   coming from the use of this file. (The GNU Lesser General Public
+   License restrictions do apply in other respects; for example, they
+   cover modification of the file, and distribution when not linked
+   into another program.)
+
+   Note that people who make modified versions of this file are not
+   obligated to grant this special exception for their modified
+   versions; it is their choice whether to do so. The GNU Lesser
+   General Public License gives permission to release a modified
+   version without this exception; this exception also makes it
+   possible to release a modified version which carries forward this
+   exception.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* crti.S puts a function prologue at the beginning of the .init and
+   .fini sections and defines global symbols for those addresses, so
+   they can be called as functions.  The symbols _init and _fini are
+   magic and cause the linker to emit DT_INIT and DT_FINI.  */
+
+#include <libc-symbols.h>
+#include <sysdep.h>
+
+#ifndef PREINIT_FUNCTION
+# define PREINIT_FUNCTION __gmon_start__
+#endif
+
+#ifndef PREINIT_FUNCTION_WEAK
+# define PREINIT_FUNCTION_WEAK 1
+#endif
+
+#if PREINIT_FUNCTION_WEAK
+	weak_extern (PREINIT_FUNCTION)
+#else
+	.hidden PREINIT_FUNCTION
+#endif
+
+	.section .init,"ax",@progbits
+	.p2align 2
+	.globl _init
+	.type _init, @function
+_init:
+	pushl %ebx
+	/* Maintain 16-byte stack alignment for called functions.  */
+	subl $8, %esp
+	LOAD_PIC_REG (bx)
+#if PREINIT_FUNCTION_WEAK
+	movl PREINIT_FUNCTION@GOT(%ebx), %eax
+	testl %eax, %eax
+	je .Lno_weak_fn
+	call PREINIT_FUNCTION@PLT
+.Lno_weak_fn:
+#else
+	call PREINIT_FUNCTION
+#endif
+
+	.section .fini,"ax",@progbits
+	.p2align 2
+	.globl _fini
+	.type _fini, @function
+_fini:
+	pushl %ebx
+	subl $8, %esp
+	LOAD_PIC_REG (bx)
diff --git a/REORG.TODO/sysdeps/i386/crtn.S b/REORG.TODO/sysdeps/i386/crtn.S
new file mode 100644
index 0000000000..b18b9c171a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/crtn.S
@@ -0,0 +1,47 @@
+/* Special .init and .fini section support for x86.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file with other
+   programs, and to distribute those programs without any restriction
+   coming from the use of this file. (The GNU Lesser General Public
+   License restrictions do apply in other respects; for example, they
+   cover modification of the file, and distribution when not linked
+   into another program.)
+
+   Note that people who make modified versions of this file are not
+   obligated to grant this special exception for their modified
+   versions; it is their choice whether to do so. The GNU Lesser
+   General Public License gives permission to release a modified
+   version without this exception; this exception also makes it
+   possible to release a modified version which carries forward this
+   exception.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* crtn.S puts function epilogues in the .init and .fini sections
+   corresponding to the prologues in crti.S. */
+
+	.section .init,"ax",@progbits
+	addl $8, %esp
+	popl %ebx
+	ret
+
+	.section .fini,"ax",@progbits
+	addl $8, %esp
+	popl %ebx
+	ret
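
Taken together, crti.S contributes the prologue, every linked object may contribute a body fragment, and crtn.S contributes the epilogue; the linker concatenates them into one function per section. Roughly what the resulting _init does, as a C sketch (the weak-symbol test mirrors the PREINIT_FUNCTION_WEAK path in crti.S; init_equivalent is a made-up name):

extern void __gmon_start__ (void) __attribute__ ((weak));

static void
init_equivalent (void)
{
  if (&__gmon_start__ != 0)   /* skipped when gmon is not linked in */
    __gmon_start__ ();
  /* ...then any .init fragments from other objects would run...  */
}

int
main (void)
{
  init_equivalent ();
  return 0;
}
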
diff --git a/REORG.TODO/sysdeps/i386/dl-irel.h b/REORG.TODO/sysdeps/i386/dl-irel.h
new file mode 100644
index 0000000000..824e81aed1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-irel.h
@@ -0,0 +1,51 @@
+/* Machine-dependent ELF indirect relocation inline functions.
+   i386 version.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_IREL_H
+#define _DL_IREL_H
+
+#include <stdio.h>
+#include <unistd.h>
+
+#define ELF_MACHINE_IREL	1
+
+static inline Elf32_Addr
+__attribute ((always_inline))
+elf_ifunc_invoke (Elf32_Addr addr)
+{
+  return ((Elf32_Addr (*) (void)) (addr)) ();
+}
+
+static inline void
+__attribute ((always_inline))
+elf_irel (const Elf32_Rel *reloc)
+{
+  Elf32_Addr *const reloc_addr = (void *) reloc->r_offset;
+  const unsigned long int r_type = ELF32_R_TYPE (reloc->r_info);
+
+  if (__glibc_likely (r_type == R_386_IRELATIVE))
+    {
+      Elf32_Addr value = elf_ifunc_invoke(*reloc_addr);
+      *reloc_addr = value;
+    }
+  else
+    __libc_fatal ("unexpected reloc type in static binary");
+}
+
+#endif /* dl-irel.h */
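
elf_irel is the consumer side of STT_GNU_IFUNC: an R_386_IRELATIVE entry points at a resolver function whose return value becomes the final address stored in the relocated slot. With a GNU toolchain the same arrangement can be produced from C via the ifunc attribute (illustrative names; a real resolver would select an implementation from CPU features):

#include <stdio.h>

static int
impl_generic (int x)
{
  return x + 1;
}

/* The resolver runs during relocation, before main.  */
static void *
resolve_increment (void)
{
  return impl_generic;
}

int increment (int x) __attribute__ ((ifunc ("resolve_increment")));

int
main (void)
{
  printf ("%d\n", increment (41));   /* 42, via the resolved impl */
  return 0;
}
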
diff --git a/REORG.TODO/sysdeps/i386/dl-lookupcfg.h b/REORG.TODO/sysdeps/i386/dl-lookupcfg.h
new file mode 100644
index 0000000000..47b534a059
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-lookupcfg.h
@@ -0,0 +1,32 @@
+/* Configuration of lookup functions.
+   Copyright (C) 2005-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define DL_UNMAP_IS_SPECIAL
+
+#include_next <dl-lookupcfg.h>
+
+/* The address of protected data defined in a shared library may be
+   external due to a copy relocation.  */
+#define DL_EXTERN_PROTECTED_DATA
+
+struct link_map;
+
+extern void _dl_unmap (struct link_map *map)
+  internal_function attribute_hidden;
+
+#define DL_UNMAP(map) _dl_unmap (map)
diff --git a/REORG.TODO/sysdeps/i386/dl-machine.h b/REORG.TODO/sysdeps/i386/dl-machine.h
new file mode 100644
index 0000000000..57d4a0bdbd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-machine.h
@@ -0,0 +1,757 @@
+/* Machine-dependent ELF dynamic relocation inline functions.  i386 version.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef dl_machine_h
+#define dl_machine_h
+
+#define ELF_MACHINE_NAME "i386"
+
+#include <sys/param.h>
+#include <sysdep.h>
+#include <tls.h>
+#include <dl-tlsdesc.h>
+#include <cpu-features.c>
+
+/* Return nonzero iff ELF header is compatible with the running host.  */
+static inline int __attribute__ ((unused))
+elf_machine_matches_host (const Elf32_Ehdr *ehdr)
+{
+  return ehdr->e_machine == EM_386;
+}
+
+
+/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
+   first element of the GOT, a special entry that is never relocated.  */
+static inline Elf32_Addr __attribute__ ((unused, const))
+elf_machine_dynamic (void)
+{
+  /* This produces a GOTOFF reloc that resolves to zero at link time, so in
+     fact just loads from the GOT register directly.  By doing it without
+     an asm we can let the compiler choose any register.  */
+  extern const Elf32_Addr _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
+  return _GLOBAL_OFFSET_TABLE_[0];
+}
+
+/* Return the run-time load address of the shared object.  */
+static inline Elf32_Addr __attribute__ ((unused))
+elf_machine_load_address (void)
+{
+  /* Compute the difference between the runtime address of _DYNAMIC as seen
+     by a GOTOFF reference, and the link-time address found in the special
+     unrelocated first GOT entry.  */
+  extern Elf32_Dyn bygotoff[] asm ("_DYNAMIC") attribute_hidden;
+  return (Elf32_Addr) &bygotoff - elf_machine_dynamic ();
+}
+
+/* Set up the loaded object described by L so its unrelocated PLT
+   entries will jump to the on-demand fixup code in dl-runtime.c.  */
+
+static inline int __attribute__ ((unused, always_inline))
+elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
+{
+  Elf32_Addr *got;
+  extern void _dl_runtime_resolve (Elf32_Word) attribute_hidden;
+  extern void _dl_runtime_profile (Elf32_Word) attribute_hidden;
+
+  if (l->l_info[DT_JMPREL] && lazy)
+    {
+      /* The GOT entries for functions in the PLT have not yet been filled
+	 in.  Their initial contents arrange, when called, to push an
+	 offset into the .rel.plt section, push _GLOBAL_OFFSET_TABLE_[1],
+	 and then jump to _GLOBAL_OFFSET_TABLE_[2].  */
+      got = (Elf32_Addr *) D_PTR (l, l_info[DT_PLTGOT]);
+      /* If a library is prelinked but we have to relocate anyway,
+	 we have to be able to undo the prelinking of .got.plt.
+	 The prelinker saved the address of .plt + 0x16 for us here.  */
+      if (got[1])
+	{
+	  l->l_mach.plt = got[1] + l->l_addr;
+	  l->l_mach.gotplt = (Elf32_Addr) &got[3];
+	}
+      got[1] = (Elf32_Addr) l;	/* Identify this shared object.  */
+
+      /* The got[2] entry contains the address of a function which gets
+	 called to get the address of a so far unresolved function and
+	 jump to it.  The profiling extension of the dynamic linker makes
+	 it possible to intercept such calls to collect information.  In
+	 this case we don't store the resolved address in the GOT, so that
+	 all future calls also end up in this function.  */
+      if (__glibc_unlikely (profile))
+	{
+	  got[2] = (Elf32_Addr) &_dl_runtime_profile;
+
+	  if (GLRO(dl_profile) != NULL
+	      && _dl_name_match_p (GLRO(dl_profile), l))
+	    /* This is the object we are looking for.  Say that we really
+	       want profiling and the timers are started.  */
+	    GL(dl_profile_map) = l;
+	}
+      else
+	/* This function will get called to fix up the GOT entry indicated by
+	   the offset on the stack, and then jump to the resolved address.  */
+	got[2] = (Elf32_Addr) &_dl_runtime_resolve;
+    }
+
+  return lazy;
+}
+
+#ifdef IN_DL_RUNTIME
+
+# ifndef PROF
+/* We add a declaration of this function here so that in dl-runtime.c
+   the ELF_MACHINE_RUNTIME_TRAMPOLINE macro really can pass the parameters
+   in registers.
+
+   We cannot use this scheme for profiling because the _mcount call
+   destroys the passed register information.  */
+#define ARCH_FIXUP_ATTRIBUTE __attribute__ ((regparm (3), stdcall, unused))
+
+extern ElfW(Addr) _dl_fixup (struct link_map *l,
+			     ElfW(Word) reloc_offset)
+     ARCH_FIXUP_ATTRIBUTE;
+extern ElfW(Addr) _dl_profile_fixup (struct link_map *l,
+				     ElfW(Word) reloc_offset,
+				     ElfW(Addr) retaddr, void *regs,
+				     long int *framesizep)
+     ARCH_FIXUP_ATTRIBUTE;
+# endif
+
+#endif
+
+/* Mask identifying addresses reserved for the user program,
+   where the dynamic linker should not map anything.  */
+#define ELF_MACHINE_USER_ADDRESS_MASK	0xf8000000UL
+
+/* Initial entry point code for the dynamic linker.
+   The C function `_dl_start' is the real entry point;
+   its return value is the user program's entry point.  */
+
+#define RTLD_START asm ("\n\
+	.text\n\
+	.align 16\n\
+0:	movl (%esp), %ebx\n\
+	ret\n\
+	.align 16\n\
+.globl _start\n\
+.globl _dl_start_user\n\
+_start:\n\
+	# Note that _dl_start gets the parameter in %eax.\n\
+	movl %esp, %eax\n\
+	call _dl_start\n\
+_dl_start_user:\n\
+	# Save the user entry point address in %edi.\n\
+	movl %eax, %edi\n\
+	# Point %ebx at the GOT.\n\
+	call 0b\n\
+	addl $_GLOBAL_OFFSET_TABLE_, %ebx\n\
+	# See if we were run as a command with the executable file\n\
+	# name as an extra leading argument.\n\
+	movl _dl_skip_args@GOTOFF(%ebx), %eax\n\
+	# Pop the original argument count.\n\
+	popl %edx\n\
+	# Adjust the stack pointer to skip _dl_skip_args words.\n\
+	leal (%esp,%eax,4), %esp\n\
+	# Subtract _dl_skip_args from argc.\n\
+	subl %eax, %edx\n\
+	# Push argc back on the stack.\n\
+	push %edx\n\
+	# The special initializer gets called with the stack just\n\
+	# as the application's entry point will see it; it can\n\
+	# switch stacks if it moves these contents over.\n\
+" RTLD_START_SPECIAL_INIT "\n\
+	# Load the parameters again.\n\
+	# (eax, edx, ecx, *--esp) = (_dl_loaded, argc, argv, envp)\n\
+	movl _rtld_local@GOTOFF(%ebx), %eax\n\
+	leal 8(%esp,%edx,4), %esi\n\
+	leal 4(%esp), %ecx\n\
+	movl %esp, %ebp\n\
+	# Make sure _dl_init is run with 16 byte aligned stack.\n\
+	andl $-16, %esp\n\
+	pushl %eax\n\
+	pushl %eax\n\
+	pushl %ebp\n\
+	pushl %esi\n\
+	# Clear %ebp, so that even constructors have terminated backchain.\n\
+	xorl %ebp, %ebp\n\
+	# Call the function to run the initializers.\n\
+	call _dl_init\n\
+	# Pass our finalizer function to the user in %edx, as per ELF ABI.\n\
+	leal _dl_fini@GOTOFF(%ebx), %edx\n\
+	# Restore %esp _start expects.\n\
+	movl (%esp), %esp\n\
+	# Jump to the user's entry point.\n\
+	jmp *%edi\n\
+	.previous\n\
+");
+
+#ifndef RTLD_START_SPECIAL_INIT
+# define RTLD_START_SPECIAL_INIT /* nothing */
+#endif
+
+/* ELF_RTYPE_CLASS_PLT iff TYPE describes relocation of a PLT entry or
+   TLS variable, so undefined references should not be allowed to
+   define the value.
+   ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one
+   of the main executable's symbols, as for a COPY reloc.
+   ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA iff TYPE describes a relocation
+   against protected data whose address may be external due to a copy
+   relocation.  */
+# define elf_machine_type_class(type) \
+  ((((type) == R_386_JMP_SLOT || (type) == R_386_TLS_DTPMOD32		      \
+     || (type) == R_386_TLS_DTPOFF32 || (type) == R_386_TLS_TPOFF32	      \
+     || (type) == R_386_TLS_TPOFF || (type) == R_386_TLS_DESC)		      \
+    * ELF_RTYPE_CLASS_PLT)						      \
+   | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY)			      \
+   | (((type) == R_386_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
+
+/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries.  */
+#define ELF_MACHINE_JMP_SLOT	R_386_JMP_SLOT
+
+/* The i386 never uses Elf32_Rela relocations for the dynamic linker.
+   Prelinked libraries may use Elf32_Rela though.  */
+#define ELF_MACHINE_PLT_REL 1
+
+/* We define an initialization function.  It is called very early in
+   _dl_sysdep_start.  */
+#define DL_PLATFORM_INIT dl_platform_init ()
+
+static inline void __attribute__ ((unused))
+dl_platform_init (void)
+{
+#if IS_IN (rtld)
+  /* init_cpu_features has been called early from __libc_start_main in
+     a static executable.  */
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
+#else
+  if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
+    /* Avoid an empty string which would disturb us.  */
+    GLRO(dl_platform) = NULL;
+#endif
+}
+
+static inline Elf32_Addr
+elf_machine_fixup_plt (struct link_map *map, lookup_t t,
+		       const Elf32_Rel *reloc,
+		       Elf32_Addr *reloc_addr, Elf32_Addr value)
+{
+  return *reloc_addr = value;
+}
+
+/* Return the final value of a plt relocation.  */
+static inline Elf32_Addr
+elf_machine_plt_value (struct link_map *map, const Elf32_Rel *reloc,
+		       Elf32_Addr value)
+{
+  return value;
+}
+
+
+/* Names of the architecture-specific auditing callback functions.  */
+#define ARCH_LA_PLTENTER i86_gnu_pltenter
+#define ARCH_LA_PLTEXIT i86_gnu_pltexit
+
+#endif /* !dl_machine_h */
+
+/* The i386 never uses Elf32_Rela relocations for the dynamic linker.
+   Prelinked libraries may use Elf32_Rela though.  */
+#define ELF_MACHINE_NO_RELA defined RTLD_BOOTSTRAP
+#define ELF_MACHINE_NO_REL 0
+
+#ifdef RESOLVE_MAP
+
+/* Perform the relocation specified by RELOC and SYM (which is fully resolved).
+   MAP is the object containing the reloc.  */
+
+auto inline void
+__attribute ((always_inline))
+elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
+		 const Elf32_Sym *sym, const struct r_found_version *version,
+		 void *const reloc_addr_arg, int skip_ifunc)
+{
+  Elf32_Addr *const reloc_addr = reloc_addr_arg;
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+# if !defined RTLD_BOOTSTRAP || !defined HAVE_Z_COMBRELOC
+  if (__glibc_unlikely (r_type == R_386_RELATIVE))
+    {
+#  if !defined RTLD_BOOTSTRAP && !defined HAVE_Z_COMBRELOC
+      /* This is defined in rtld.c, but nowhere in the static libc.a;
+	 make the reference weak so static programs can still link.
+	 This declaration cannot be done when compiling rtld.c
+	 (i.e. #ifdef RTLD_BOOTSTRAP) because rtld.c contains the
+	 common defn for _dl_rtld_map, which is incompatible with a
+	 weak decl in the same file.  */
+#   ifndef SHARED
+      weak_extern (_dl_rtld_map);
+#   endif
+      if (map != &GL(dl_rtld_map)) /* Already done in rtld itself.  */
+#  endif
+	*reloc_addr += map->l_addr;
+    }
+#  ifndef RTLD_BOOTSTRAP
+  else if (__glibc_unlikely (r_type == R_386_NONE))
+    return;
+#  endif
+  else
+# endif	/* !RTLD_BOOTSTRAP and have no -z combreloc */
+    {
+# ifndef RTLD_BOOTSTRAP
+      const Elf32_Sym *const refsym = sym;
+# endif
+      struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type);
+      Elf32_Addr value = sym_map == NULL ? 0 : sym_map->l_addr + sym->st_value;
+
+      if (sym != NULL
+	  && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC,
+			       0)
+	  && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
+	  && __builtin_expect (!skip_ifunc, 1))
+	{
+# ifndef RTLD_BOOTSTRAP
+	  if (sym_map != map
+	      && sym_map->l_type != lt_executable
+	      && !sym_map->l_relocated)
+	    {
+	      const char *strtab
+		= (const char *) D_PTR (map, l_info[DT_STRTAB]);
+	      _dl_error_printf ("\
+%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
+				RTLD_PROGNAME, map->l_name,
+				sym_map->l_name,
+				strtab + refsym->st_name);
+	    }
+# endif
+	  value = ((Elf32_Addr (*) (void)) value) ();
+	}
+
+      switch (r_type)
+	{
+# ifndef RTLD_BOOTSTRAP
+	case R_386_SIZE32:
+	  /* Set to symbol size plus addend.  */
+	  *reloc_addr += sym->st_size;
+	  break;
+# endif
+	case R_386_GLOB_DAT:
+	case R_386_JMP_SLOT:
+	  *reloc_addr = value;
+	  break;
+
+	case R_386_TLS_DTPMOD32:
+# ifdef RTLD_BOOTSTRAP
+	  /* During startup the dynamic linker is always the module
+	     with index 1.
+	     XXX If this relocation is necessary, move it before the
+	     RESOLVE call.  */
+	  *reloc_addr = 1;
+# else
+	  /* Get the information from the link map returned by the
+	     resolve function.  */
+	  if (sym_map != NULL)
+	    *reloc_addr = sym_map->l_tls_modid;
+# endif
+	  break;
+	case R_386_TLS_DTPOFF32:
+# ifndef RTLD_BOOTSTRAP
+	  /* During relocation all TLS symbols are defined and used.
+	     Therefore the offset is already correct.  */
+	  if (sym != NULL)
+	    *reloc_addr = sym->st_value;
+# endif
+	  break;
+	case R_386_TLS_DESC:
+	  {
+	    struct tlsdesc volatile *td =
+	      (struct tlsdesc volatile *)reloc_addr;
+
+# ifndef RTLD_BOOTSTRAP
+	    if (! sym)
+	      td->entry = _dl_tlsdesc_undefweak;
+	    else
+# endif
+	      {
+# ifndef RTLD_BOOTSTRAP
+#  ifndef SHARED
+		CHECK_STATIC_TLS (map, sym_map);
+#  else
+		if (!TRY_STATIC_TLS (map, sym_map))
+		  {
+		    td->arg = _dl_make_tlsdesc_dynamic
+		      (sym_map, sym->st_value + (ElfW(Word))td->arg);
+		    td->entry = _dl_tlsdesc_dynamic;
+		  }
+		else
+#  endif
+# endif
+		  {
+		    td->arg = (void*)(sym->st_value - sym_map->l_tls_offset
+				      + (ElfW(Word))td->arg);
+		    td->entry = _dl_tlsdesc_return;
+		  }
+	      }
+	    break;
+	  }
+	case R_386_TLS_TPOFF32:
+	  /* The offset is positive, backward from the thread pointer.  */
+#  ifdef RTLD_BOOTSTRAP
+	  *reloc_addr += map->l_tls_offset - sym->st_value;
+#  else
+	  /* We know the offset of the object the symbol is contained in.
+	     It is a positive value which will be subtracted from the
+	     thread pointer.  To get the variable's position in the TLS
+	     block we subtract the offset from that of the TLS block.  */
+	  if (sym != NULL)
+	    {
+	      CHECK_STATIC_TLS (map, sym_map);
+	      *reloc_addr += sym_map->l_tls_offset - sym->st_value;
+	    }
+# endif
+	  break;
+	case R_386_TLS_TPOFF:
+	  /* The offset is negative, forward from the thread pointer.  */
+# ifdef RTLD_BOOTSTRAP
+	  *reloc_addr += sym->st_value - map->l_tls_offset;
+# else
+	  /* We know the offset of the object the symbol is contained in.
+	     It is a negative value which will be added to the thread
+	     pointer.  */
+	  if (sym != NULL)
+	    {
+	      CHECK_STATIC_TLS (map, sym_map);
+	      *reloc_addr += sym->st_value - sym_map->l_tls_offset;
+	    }
+# endif
+	  break;
+
+# ifndef RTLD_BOOTSTRAP
+	case R_386_32:
+	  *reloc_addr += value;
+	  break;
+	case R_386_PC32:
+	  *reloc_addr += (value - (Elf32_Addr) reloc_addr);
+	  break;
+	case R_386_COPY:
+	  if (sym == NULL)
+	    /* This can happen in trace mode if an object could not be
+	       found.  */
+	    break;
+	  if (__builtin_expect (sym->st_size > refsym->st_size, 0)
+	      || (__builtin_expect (sym->st_size < refsym->st_size, 0)
+		  && GLRO(dl_verbose)))
+	    {
+	      const char *strtab;
+
+	      strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+	      _dl_error_printf ("\
+%s: Symbol `%s' has different size in shared object, consider re-linking\n",
+				RTLD_PROGNAME, strtab + refsym->st_name);
+	    }
+	  memcpy (reloc_addr_arg, (void *) value,
+		  MIN (sym->st_size, refsym->st_size));
+	  break;
+	case R_386_IRELATIVE:
+	  value = map->l_addr + *reloc_addr;
+	  value = ((Elf32_Addr (*) (void)) value) ();
+	  *reloc_addr = value;
+	  break;
+	default:
+	  _dl_reloc_bad_type (map, r_type, 0);
+	  break;
+# endif	/* !RTLD_BOOTSTRAP */
+	}
+    }
+}
+
+# ifndef RTLD_BOOTSTRAP
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
+		  const Elf32_Sym *sym, const struct r_found_version *version,
+		  void *const reloc_addr_arg, int skip_ifunc)
+{
+  Elf32_Addr *const reloc_addr = reloc_addr_arg;
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+  if (ELF32_R_TYPE (reloc->r_info) == R_386_RELATIVE)
+    *reloc_addr = map->l_addr + reloc->r_addend;
+  else if (r_type != R_386_NONE)
+    {
+#  ifndef RESOLVE_CONFLICT_FIND_MAP
+      const Elf32_Sym *const refsym = sym;
+#  endif
+      struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type);
+      Elf32_Addr value = sym == NULL ? 0 : sym_map->l_addr + sym->st_value;
+
+      if (sym != NULL
+	  && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
+	  && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+	  && __builtin_expect (!skip_ifunc, 1))
+	value = ((Elf32_Addr (*) (void)) value) ();
+
+      switch (ELF32_R_TYPE (reloc->r_info))
+	{
+	case R_386_SIZE32:
+	  /* Set to symbol size plus addend.  */
+	  value = sym->st_size;
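+	  /* Fall through.  */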
+	case R_386_GLOB_DAT:
+	case R_386_JMP_SLOT:
+	case R_386_32:
+	  *reloc_addr = value + reloc->r_addend;
+	  break;
+#  ifndef RESOLVE_CONFLICT_FIND_MAP
+	  /* Not needed for dl-conflict.c.  */
+	case R_386_PC32:
+	  *reloc_addr = (value + reloc->r_addend - (Elf32_Addr) reloc_addr);
+	  break;
+
+	case R_386_TLS_DTPMOD32:
+	  /* Get the information from the link map returned by the
+	     resolve function.  */
+	  if (sym_map != NULL)
+	    *reloc_addr = sym_map->l_tls_modid;
+	  break;
+	case R_386_TLS_DTPOFF32:
+	  /* During relocation all TLS symbols are defined and used.
+	     Therefore the offset is already correct.  */
+	  *reloc_addr = (sym == NULL ? 0 : sym->st_value) + reloc->r_addend;
+	  break;
+	case R_386_TLS_DESC:
+	  {
+	    struct tlsdesc volatile *td =
+	      (struct tlsdesc volatile *)reloc_addr;
+
+#   ifndef RTLD_BOOTSTRAP
+	    if (!sym)
+	      {
+		td->arg = (void*)reloc->r_addend;
+		td->entry = _dl_tlsdesc_undefweak;
+	      }
+	    else
+#   endif
+	      {
+#   ifndef RTLD_BOOTSTRAP
+#    ifndef SHARED
+		CHECK_STATIC_TLS (map, sym_map);
+#    else
+		if (!TRY_STATIC_TLS (map, sym_map))
+		  {
+		    td->arg = _dl_make_tlsdesc_dynamic
+		      (sym_map, sym->st_value + reloc->r_addend);
+		    td->entry = _dl_tlsdesc_dynamic;
+		  }
+		else
+#    endif
+#   endif
+		  {
+		    td->arg = (void*)(sym->st_value - sym_map->l_tls_offset
+				      + reloc->r_addend);
+		    td->entry = _dl_tlsdesc_return;
+		  }
+	      }
+	  }
+	  break;
+	case R_386_TLS_TPOFF32:
+	  /* The offset is positive, backward from the thread pointer.  */
+	  /* We know the offset of the object the symbol is contained in.
+	     It is a positive value which will be subtracted from the
+	     thread pointer.  To get the variable's position in the TLS
+	     block we subtract the offset from that of the TLS block.  */
+	  if (sym != NULL)
+	    {
+	      CHECK_STATIC_TLS (map, sym_map);
+	      *reloc_addr = sym_map->l_tls_offset - sym->st_value
+			    + reloc->r_addend;
+	    }
+	  break;
+	case R_386_TLS_TPOFF:
+	  /* The offset is negative, forward from the thread pointer.  */
+	  /* We know the offset of the object the symbol is contained in.
+	     It is a negative value which will be added to the thread
+	     pointer.  */
+	  if (sym != NULL)
+	    {
+	      CHECK_STATIC_TLS (map, sym_map);
+	      *reloc_addr = sym->st_value - sym_map->l_tls_offset
+			    + reloc->r_addend;
+	    }
+	  break;
+	case R_386_COPY:
+	  if (sym == NULL)
+	    /* This can happen in trace mode if an object could not be
+	       found.  */
+	    break;
+	  if (__builtin_expect (sym->st_size > refsym->st_size, 0)
+	      || (__builtin_expect (sym->st_size < refsym->st_size, 0)
+		  && GLRO(dl_verbose)))
+	    {
+	      const char *strtab;
+
+	      strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+	      _dl_error_printf ("\
+%s: Symbol `%s' has different size in shared object, consider re-linking\n",
+				RTLD_PROGNAME, strtab + refsym->st_name);
+	    }
+	  memcpy (reloc_addr_arg, (void *) value,
+		  MIN (sym->st_size, refsym->st_size));
+	  break;
+#  endif /* !RESOLVE_CONFLICT_FIND_MAP */
+	case R_386_IRELATIVE:
+	  value = map->l_addr + reloc->r_addend;
+	  value = ((Elf32_Addr (*) (void)) value) ();
+	  *reloc_addr = value;
+	  break;
+	default:
+	  /* We include these checks in the version used to relocate
+	     ld.so only while we are still debugging.  */
+	  _dl_reloc_bad_type (map, r_type, 0);
+	  break;
+	}
+    }
+}
+# endif	/* !RTLD_BOOTSTRAP */
+
+auto inline void
+__attribute ((always_inline))
+elf_machine_rel_relative (Elf32_Addr l_addr, const Elf32_Rel *reloc,
+			  void *const reloc_addr_arg)
+{
+  Elf32_Addr *const reloc_addr = reloc_addr_arg;
+  assert (ELF32_R_TYPE (reloc->r_info) == R_386_RELATIVE);
+  *reloc_addr += l_addr;
+}
+
+# ifndef RTLD_BOOTSTRAP
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
+			   void *const reloc_addr_arg)
+{
+  Elf32_Addr *const reloc_addr = reloc_addr_arg;
+  *reloc_addr = l_addr + reloc->r_addend;
+}
+# endif	/* !RTLD_BOOTSTRAP */
+
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_lazy_rel (struct link_map *map,
+		      Elf32_Addr l_addr, const Elf32_Rel *reloc,
+		      int skip_ifunc)
+{
+  Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+  /* Check for unexpected PLT reloc type.  */
+  if (__glibc_likely (r_type == R_386_JMP_SLOT))
+    {
+      if (__builtin_expect (map->l_mach.plt, 0) == 0)
+	*reloc_addr += l_addr;
+      else
+	*reloc_addr = (map->l_mach.plt
+		       + (((Elf32_Addr) reloc_addr) - map->l_mach.gotplt) * 4);
+    }
+  else if (__glibc_likely (r_type == R_386_TLS_DESC))
+    {
+      struct tlsdesc volatile * __attribute__((__unused__)) td =
+	(struct tlsdesc volatile *)reloc_addr;
+
+      /* Handle relocations that reference the local *ABS* in a simple
+	 way, so as to preserve a potential addend.  */
+      if (ELF32_R_SYM (reloc->r_info) == 0)
+	td->entry = _dl_tlsdesc_resolve_abs_plus_addend;
+      /* Given a known-zero addend, we can store a pointer to the
+	 reloc in the arg position.  */
+      else if (td->arg == 0)
+	{
+	  td->arg = (void*)reloc;
+	  td->entry = _dl_tlsdesc_resolve_rel;
+	}
+      else
+	{
+	  /* We could handle non-*ABS* relocations with non-zero addends
+	     by dynamically allocating an arg to hold a pointer to the
+	     reloc, but that sounds pointless.  */
+	  const Elf32_Rel *const r = reloc;
+	  /* The code below was borrowed from elf_dynamic_do_rel().  */
+	  const ElfW(Sym) *const symtab =
+	    (const void *) D_PTR (map, l_info[DT_SYMTAB]);
+
+# ifdef RTLD_BOOTSTRAP
+	  /* The dynamic linker always uses versioning.  */
+	  assert (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL);
+# else
+	  if (map->l_info[VERSYMIDX (DT_VERSYM)])
+# endif
+	    {
+	      const ElfW(Half) *const version =
+		(const void *) D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
+	      ElfW(Half) ndx = version[ELFW(R_SYM) (r->r_info)] & 0x7fff;
+	      elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)],
+			       &map->l_versions[ndx],
+			       (void *) (l_addr + r->r_offset), skip_ifunc);
+	    }
+# ifndef RTLD_BOOTSTRAP
+	  else
+	    elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], NULL,
+			     (void *) (l_addr + r->r_offset), skip_ifunc);
+# endif
+	}
+    }
+  else if (__glibc_unlikely (r_type == R_386_IRELATIVE))
+    {
+      Elf32_Addr value = map->l_addr + *reloc_addr;
+      if (__glibc_likely (!skip_ifunc))
+	value = ((Elf32_Addr (*) (void)) value) ();
+      *reloc_addr = value;
+    }
+  else
+    _dl_reloc_bad_type (map, r_type, 1);
+}
+
+# ifndef RTLD_BOOTSTRAP
+
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_lazy_rela (struct link_map *map,
+		       Elf32_Addr l_addr, const Elf32_Rela *reloc,
+		       int skip_ifunc)
+{
+  Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+  if (__glibc_likely (r_type == R_386_JMP_SLOT))
+    ;
+  else if (__glibc_likely (r_type == R_386_TLS_DESC))
+    {
+      struct tlsdesc volatile * __attribute__((__unused__)) td =
+	(struct tlsdesc volatile *)reloc_addr;
+
+      td->arg = (void*)reloc;
+      td->entry = _dl_tlsdesc_resolve_rela;
+    }
+  else if (__glibc_unlikely (r_type == R_386_IRELATIVE))
+    {
+      Elf32_Addr value = map->l_addr + reloc->r_addend;
+      if (__glibc_likely (!skip_ifunc))
+	value = ((Elf32_Addr (*) (void)) value) ();
+      *reloc_addr = value;
+    }
+  else
+    _dl_reloc_bad_type (map, r_type, 1);
+}
+
+# endif	/* !RTLD_BOOTSTRAP */
+
+#endif /* RESOLVE_MAP */
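
The arithmetic for the common non-TLS cases above is compact enough to demonstrate outside the dynamic linker. A sketch applying the three simplest i386 relocation kinds to an in-memory word (the type values match <elf.h>; everything else is illustrative):

#include <inttypes.h>
#include <stdio.h>

enum { R_386_32 = 1, R_386_PC32 = 2, R_386_RELATIVE = 8 };

static void
apply_reloc (unsigned int type, uint32_t *reloc_addr,
             uint32_t l_addr, uint32_t value)
{
  switch (type)
    {
    case R_386_RELATIVE:   /* add the load bias to the stored addend */
      *reloc_addr += l_addr;
      break;
    case R_386_32:         /* add the resolved symbol value */
      *reloc_addr += value;
      break;
    case R_386_PC32:       /* PC-relative: subtract the reloc's own address */
      *reloc_addr += value - (uint32_t) (uintptr_t) reloc_addr;
      break;
    }
}

int
main (void)
{
  uint32_t word = 0x100;                    /* in-place addend */
  apply_reloc (R_386_32, &word, 0, 0x400000);
  printf ("R_386_32 result: 0x%" PRIx32 "\n", word);   /* 0x400100 */
  return 0;
}
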
diff --git a/REORG.TODO/sysdeps/i386/dl-procinfo.c b/REORG.TODO/sysdeps/i386/dl-procinfo.c
new file mode 100644
index 0000000000..7237f778b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-procinfo.c
@@ -0,0 +1,65 @@
+/* Data for i386 version of processor capability information.
+   Copyright (C) 2001-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2001.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* If anything should be added here check whether the size of each string
+   is still ok with the given array size.
+
+   All the #ifdefs in the definitions are quite irritating but
+   necessary if we want to avoid duplicating the information.  There
+   are three different modes:
+
+   - PROCINFO_DECL is defined.  This means we are only interested in
+     declarations.
+
+   - PROCINFO_DECL is not defined:
+
+     + if SHARED is defined the file is included in an array
+       initializer.  The .element = { ... } syntax is needed.
+
+     + if SHARED is not defined a normal array initialization is
+       needed.
+  */
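+
+/* For example (a sketch of the expansion): with PROCINFO_DECL
+   defined the block below reduces to a declaration,
+
+     PROCINFO_CLASS const char _dl_x86_cap_flags[32][8];
+
+   while in the SHARED initializer case it contributes the element
+
+     ._dl_x86_cap_flags = { "fpu", ... },
+
+   as the #if cascade below spells out.  */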
+
+#ifndef PROCINFO_CLASS
+# define PROCINFO_CLASS
+#endif
+
+#include <sysdeps/x86/dl-procinfo.c>
+
+#if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_cap_flags
+#else
+PROCINFO_CLASS const char _dl_x86_cap_flags[32][8]
+#endif
+#ifndef PROCINFO_DECL
+= {
+    "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
+    "cx8", "apic", "10", "sep", "mtrr", "pge", "mca", "cmov",
+    "pat", "pse36", "pn", "clflush", "20", "dts", "acpi", "mmx",
+    "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe"
+  }
+#endif
+#if !defined SHARED || defined PROCINFO_DECL
+;
+#else
+,
+#endif
+
+#undef PROCINFO_DECL
+#undef PROCINFO_CLASS
diff --git a/REORG.TODO/sysdeps/i386/dl-tls.h b/REORG.TODO/sysdeps/i386/dl-tls.h
new file mode 100644
index 0000000000..525ebab992
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tls.h
@@ -0,0 +1,61 @@
+/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* Type used for the representation of TLS information in the GOT.  */
+typedef struct dl_tls_index
+{
+  unsigned long int ti_module;
+  unsigned long int ti_offset;
+} tls_index;
+
+
+#ifdef SHARED
+/* This is the prototype for the GNU version.  */
+extern void *___tls_get_addr (tls_index *ti)
+     __attribute__ ((__regparm__ (1)));
+extern void *___tls_get_addr_internal (tls_index *ti)
+     __attribute__ ((__regparm__ (1))) attribute_hidden;
+
+# if IS_IN (rtld)
+/* The special thing about the x86 TLS ABI is that we have two
+   variants of the __tls_get_addr function with different calling
+   conventions.  The GNU version, which is the one we are mostly
+   concerned with here, takes the parameter in a register.  The name
+   is changed by adding an additional underscore at the beginning.
+   The Sun version uses the normal calling convention.  */
+void *
+__tls_get_addr (tls_index *ti)
+{
+  return ___tls_get_addr_internal (ti);
+}
+
+
+/* Prepare using the definition of __tls_get_addr in the generic
+   version of this file.  */
+# define __tls_get_addr __attribute__ ((__regparm__ (1))) ___tls_get_addr
+strong_alias (___tls_get_addr, ___tls_get_addr_internal)
+rtld_hidden_proto (___tls_get_addr)
+rtld_hidden_def (___tls_get_addr)
+# else
+
+/* Users should get the better (register-parameter) interface.  */
+# define __tls_get_addr ___tls_get_addr
+
+# endif
+#endif
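+
+/* Illustration (not mandated by this header): a general-dynamic
+   access in the GNU variant loads the address of the GOT entry
+   holding the tls_index into %eax and calls the triple-underscore
+   entry point,
+
+	leal	x@tlsgd(,%ebx,1), %eax
+	call	___tls_get_addr@plt
+
+   matching the regparm(1) prototypes above.  */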
diff --git a/REORG.TODO/sysdeps/i386/dl-tlsdesc.S b/REORG.TODO/sysdeps/i386/dl-tlsdesc.S
new file mode 100644
index 0000000000..8befdc2b39
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tlsdesc.S
@@ -0,0 +1,285 @@
+/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <tls.h>
+#include "tlsdesc.h"
+
+	.text
+
+     /* This function is used to compute the TP offset for symbols in
+	Static TLS, i.e., whose TP offset is the same for all
+	threads.
+
+	The incoming %eax points to the TLS descriptor, such that
+	0(%eax) points to _dl_tlsdesc_return itself, and 4(%eax) holds
+	the TP offset of the symbol corresponding to the object
+	denoted by the argument.  */
+
+	.hidden _dl_tlsdesc_return
+	.global	_dl_tlsdesc_return
+	.type	_dl_tlsdesc_return,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_return:
+	movl	4(%eax), %eax
+	ret
+	cfi_endproc
+	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
+
+     /* This function is used for undefined weak TLS symbols, for
+	which the base address (i.e., disregarding any addend) should
+	resolve to NULL.
+
+	%eax points to the TLS descriptor, such that 0(%eax) points to
+	_dl_tlsdesc_undefweak itself, and 4(%eax) holds the addend.
+	We return the addend minus the TP, such that, when the caller
+	adds TP, it gets the addend back.  If that's zero, as usual,
+	that's most likely a NULL pointer.  */
+
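+     /* In symbols: with thread pointer TP and addend A, this
+	resolver returns A - TP, and the caller computes
+	TP + (A - TP) == A; in the usual A == 0 case that is a null
+	pointer.  */
+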
+	.hidden _dl_tlsdesc_undefweak
+	.global	_dl_tlsdesc_undefweak
+	.type	_dl_tlsdesc_undefweak,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_undefweak:
+	movl	4(%eax), %eax
+	subl	%gs:0, %eax
+	ret
+	cfi_endproc
+	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
+
+#ifdef SHARED
+	.hidden _dl_tlsdesc_dynamic
+	.global	_dl_tlsdesc_dynamic
+	.type	_dl_tlsdesc_dynamic,@function
+
+     /* This function is used for symbols that need dynamic TLS.
+
+	%eax points to the TLS descriptor, such that 0(%eax) points to
+	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
+	tlsdesc_dynamic_arg object.  It must return in %eax the offset
+	between the thread pointer and the object denoted by the
+	argument, without clobbering any registers.
+
+	The assembly code that follows is a rendition of the following
+	C code, hand-optimized a little bit.
+
+ptrdiff_t
+__attribute__ ((__regparm__ (1)))
+_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
+{
+  struct tlsdesc_dynamic_arg *td = tdp->arg;
+  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+  if (__builtin_expect (td->gen_count <= dtv[0].counter
+			&& (dtv[td->tlsinfo.ti_module].pointer.val
+			    != TLS_DTV_UNALLOCATED),
+			1))
+    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+      - __thread_pointer;
+
+  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+}
+*/
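+/* TLSDESC_ARG, TLSDESC_GEN_COUNT, TLSDESC_MODID and TLSDESC_MODOFF
+   below are the assembler offsets of the fields used in the C
+   rendition above; they come via the included tlsdesc.h.  */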
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_dynamic:
+	/* Like all TLS resolvers, preserve call-clobbered registers.
+	   We need two scratch regs anyway.  */
+	subl	$28, %esp
+	cfi_adjust_cfa_offset (28)
+	movl	%ecx, 20(%esp)
+	movl	%edx, 24(%esp)
+	movl	TLSDESC_ARG(%eax), %eax
+	movl	%gs:DTV_OFFSET, %edx
+	movl	TLSDESC_GEN_COUNT(%eax), %ecx
+	cmpl	(%edx), %ecx
+	ja	.Lslow
+	movl	TLSDESC_MODID(%eax), %ecx
+	movl	(%edx,%ecx,8), %edx
+	cmpl	$-1, %edx
+	je	.Lslow
+	movl	TLSDESC_MODOFF(%eax), %eax
+	addl	%edx, %eax
+.Lret:
+	movl	20(%esp), %ecx
+	subl	%gs:0, %eax
+	movl	24(%esp), %edx
+	addl	$28, %esp
+	cfi_adjust_cfa_offset (-28)
+	ret
+	.p2align 4,,7
+.Lslow:
+	cfi_adjust_cfa_offset (28)
+	call	HIDDEN_JUMPTARGET (___tls_get_addr)
+	jmp	.Lret
+	cfi_endproc
+	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+#endif /* SHARED */
+
+     /* This function is a wrapper for a lazy resolver for TLS_DESC
+	REL relocations that reference the *ABS* segment in their own
+	link maps.  %ebx points to the caller's GOT.  %eax points to a
+	TLS descriptor, such that 0(%eax) holds the address of the
+	resolver wrapper itself (unless some other thread beat us to
+	it) and 4(%eax) holds the addend in the relocation.
+
+	When the actual resolver returns, it will have adjusted the
+	TLS descriptor such that we can tail-call it for it to return
+	the TP offset of the symbol.  */
+
+	.hidden _dl_tlsdesc_resolve_abs_plus_addend
+	.global	_dl_tlsdesc_resolve_abs_plus_addend
+	.type	_dl_tlsdesc_resolve_abs_plus_addend,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_resolve_abs_plus_addend:
+0:
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	pushl	%ecx
+	cfi_adjust_cfa_offset (4)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+	movl	$1f - 0b, %ecx
+	movl	4(%ebx), %edx
+	call	_dl_tlsdesc_resolve_abs_plus_addend_fixup
+1:
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	popl	%ecx
+	cfi_adjust_cfa_offset (-4)
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	jmp	*(%eax)
+	cfi_endproc
+	.size	_dl_tlsdesc_resolve_abs_plus_addend, .-_dl_tlsdesc_resolve_abs_plus_addend
+
+     /* This function is a wrapper for a lazy resolver for TLS_DESC
+	REL relocations that had zero addends.  %ebx points to the
+	caller's GOT.  %eax points to a TLS descriptor, such that
+	0(%eax) holds the address of the resolver wrapper itself
+	(unless some other thread beat us to it) and 4(%eax) holds a
+	pointer to the relocation.
+
+	When the actual resolver returns, it will have adjusted the
+	TLS descriptor such that we can tail-call it for it to return
+	the TP offset of the symbol.  */
+
+	.hidden _dl_tlsdesc_resolve_rel
+	.global	_dl_tlsdesc_resolve_rel
+	.type	_dl_tlsdesc_resolve_rel,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_resolve_rel:
+0:
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	pushl	%ecx
+	cfi_adjust_cfa_offset (4)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+	movl	$1f - 0b, %ecx
+	movl	4(%ebx), %edx
+	call	_dl_tlsdesc_resolve_rel_fixup
+1:
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	popl	%ecx
+	cfi_adjust_cfa_offset (-4)
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	jmp	*(%eax)
+	cfi_endproc
+	.size	_dl_tlsdesc_resolve_rel, .-_dl_tlsdesc_resolve_rel
+
+     /* This function is a wrapper for a lazy resolver for TLS_DESC
+	RELA relocations.  %ebx points to the caller's GOT.  %eax
+	points to a TLS descriptor, such that 0(%eax) holds the
+	address of the resolver wrapper itself (unless some other
+	thread beat us to it) and 4(%eax) holds a pointer to the
+	relocation.
+
+	When the actual resolver returns, it will have adjusted the
+	TLS descriptor such that we can tail-call it for it to return
+	the TP offset of the symbol.  */
+
+	.hidden _dl_tlsdesc_resolve_rela
+	.global	_dl_tlsdesc_resolve_rela
+	.type	_dl_tlsdesc_resolve_rela,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_resolve_rela:
+0:
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	pushl	%ecx
+	cfi_adjust_cfa_offset (4)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+	movl	$1f - 0b, %ecx
+	movl	4(%ebx), %edx
+	call	_dl_tlsdesc_resolve_rela_fixup
+1:
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	popl	%ecx
+	cfi_adjust_cfa_offset (-4)
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	jmp	*(%eax)
+	cfi_endproc
+	.size	_dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela
+
+     /* This function is a placeholder for lazy resolving of TLS
+	relocations.  Once some thread starts resolving a TLS
+	relocation, it sets up the TLS descriptor to use this
+	resolver, such that other threads that would attempt to
+	resolve it concurrently may skip the call to the original lazy
+	resolver and go straight to a condition wait.
+
+	When the actual resolver returns, it will have adjusted the
+	TLS descriptor such that we can tail-call it for it to return
+	the TP offset of the symbol.  */
+
+	.hidden _dl_tlsdesc_resolve_hold
+	.global	_dl_tlsdesc_resolve_hold
+	.type	_dl_tlsdesc_resolve_hold,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_resolve_hold:
+0:
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	pushl	%ecx
+	cfi_adjust_cfa_offset (4)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+	movl	$1f - 0b, %ecx
+	movl	4(%ebx), %edx
+	call	_dl_tlsdesc_resolve_hold_fixup
+1:
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	popl	%ecx
+	cfi_adjust_cfa_offset (-4)
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	jmp	*(%eax)
+	cfi_endproc
+	.size	_dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold
diff --git a/REORG.TODO/sysdeps/i386/dl-tlsdesc.h b/REORG.TODO/sysdeps/i386/dl-tlsdesc.h
new file mode 100644
index 0000000000..242bebfc8e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tlsdesc.h
@@ -0,0 +1,61 @@
+/* Thread-local storage descriptor handling in the ELF dynamic linker.
+   i386 version.
+   Copyright (C) 2005-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _I386_DL_TLSDESC_H
+# define _I386_DL_TLSDESC_H 1
+
+/* Type used to represent a TLS descriptor in the GOT.  */
+struct tlsdesc
+{
+  ptrdiff_t __attribute__ ((regparm (1))) (*entry) (struct tlsdesc *);
+  void *arg;
+};
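+
+/* Both members are 32 bits on i386, so the assembly in dl-tlsdesc.S
+   reads entry at 0(%eax) and arg at 4(%eax).  */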
+
+typedef struct dl_tls_index
+{
+  unsigned long int ti_module;
+  unsigned long int ti_offset;
+} tls_index;
+
+/* Type used as the argument in a TLS descriptor for a symbol that
+   needs dynamic TLS offsets.  */
+struct tlsdesc_dynamic_arg
+{
+  tls_index tlsinfo;
+  size_t gen_count;
+};
+
+extern ptrdiff_t attribute_hidden __attribute__ ((regparm (1)))
+  _dl_tlsdesc_return (struct tlsdesc *),
+  _dl_tlsdesc_undefweak (struct tlsdesc *),
+  _dl_tlsdesc_resolve_abs_plus_addend (struct tlsdesc *),
+  _dl_tlsdesc_resolve_rel (struct tlsdesc *),
+  _dl_tlsdesc_resolve_rela (struct tlsdesc *),
+  _dl_tlsdesc_resolve_hold (struct tlsdesc *);
+
+# ifdef SHARED
+extern void *_dl_make_tlsdesc_dynamic (struct link_map *map,
+				       size_t ti_offset)
+  internal_function attribute_hidden;
+
+extern ptrdiff_t attribute_hidden __attribute__ ((regparm (1)))
+  _dl_tlsdesc_dynamic (struct tlsdesc *);
+# endif
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/dl-trampoline.S b/REORG.TODO/sysdeps/i386/dl-trampoline.S
new file mode 100644
index 0000000000..6e7f3aef92
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-trampoline.S
@@ -0,0 +1,215 @@
+/* PLT trampolines.  i386 version.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <link-defines.h>
+
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+#endif
+
+	.text
+	.globl _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	cfi_startproc
+	.align 16
+_dl_runtime_resolve:
+	cfi_adjust_cfa_offset (8)
+	pushl %eax		# Preserve registers otherwise clobbered.
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %edx
+	cfi_adjust_cfa_offset (4)
+	movl 16(%esp), %edx	# Copy args pushed by PLT in register.  Note
+	movl 12(%esp), %eax	# that `fixup' takes its parameters in regs.
+	call _dl_fixup		# Call resolver.
+	popl %edx		# Get register content back.
+	cfi_adjust_cfa_offset (-4)
+	movl (%esp), %ecx
+	movl %eax, (%esp)	# Store the function address.
+	movl 4(%esp), %eax
+	ret $12			# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
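+
+	/* The ret $12 above both enters the resolved function (its
+	   address was stored over the just-restored %ecx slot and is
+	   popped as the return address) and discards 12 more bytes:
+	   the saved-%eax slot plus the two words pushed by the PLT.  */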
+
+
+#ifndef PROF
+	.globl _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	cfi_startproc
+	.align 16
+_dl_runtime_profile:
+	cfi_adjust_cfa_offset (8)
+	pushl %esp
+	cfi_adjust_cfa_offset (4)
+	addl $8, (%esp)		# Account for the pushed PLT data
+	pushl %ebp
+	cfi_adjust_cfa_offset (4)
+	pushl %eax		# Preserve registers otherwise clobbered.
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %edx
+	cfi_adjust_cfa_offset (4)
+	movl %esp, %ecx
+	subl $8, %esp
+	cfi_adjust_cfa_offset (8)
+	movl $-1, 4(%esp)
+	leal 4(%esp), %edx
+	movl %edx, (%esp)
+	pushl %ecx		# Address of the register structure
+	cfi_adjust_cfa_offset (4)
+	movl 40(%esp), %ecx	# Load return address
+	movl 36(%esp), %edx	# Copy args pushed by PLT in register.  Note
+	movl 32(%esp), %eax	# that `fixup' takes its parameters in regs.
+	call _dl_profile_fixup	# Call resolver.
+	cfi_adjust_cfa_offset (-8)
+	movl (%esp), %edx
+	testl %edx, %edx
+	jns 1f
+	popl %edx
+	cfi_adjust_cfa_offset (-4)
+	popl %edx		# Get register content back.
+	cfi_adjust_cfa_offset (-4)
+	movl (%esp), %ecx
+	movl %eax, (%esp)	# Store the function address.
+	movl 4(%esp), %eax
+	ret $20			# Jump to function address.
+
+	/*
+	    +32     return address
+	    +28     PLT1
+	    +24     PLT2
+	    +20     %esp
+	    +16     %ebp
+	    +12     %eax
+	    +8      %ecx
+	    +4      %edx
+	   %esp     free
+	*/
+	cfi_adjust_cfa_offset (8)
+1:	movl %ebx, (%esp)
+	cfi_rel_offset (ebx, 0)
+	movl %edx, %ebx		# This is the frame buffer size
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (esi, 0)
+	leal 44(%esp), %esi
+	movl %ebx, %ecx
+	orl $4, %ebx		# Increase frame size if necessary to align
+				# stack for the function call
+	andl $~3, %ebx
+	movl %esp, %edi
+	subl %ebx, %edi
+	movl %esp, %ebx
+	cfi_def_cfa_register (ebx)
+	movl %edi, %esp
+	shrl $2, %ecx
+	rep
+	movsl
+	movl (%ebx), %esi
+	cfi_restore (esi)
+	movl 4(%ebx), %edi
+	cfi_restore (edi)
+	/*
+	   %ebx+40  return address
+	   %ebx+36  PLT1
+	   %ebx+32  PLT2
+	   %ebx+28  %esp
+	   %ebx+24  %ebp
+	   %ebx+20  %eax
+	   %ebx+16  %ecx
+	   %ebx+12  %edx
+	   %ebx+8   %ebx
+	   %ebx+4   free
+	   %ebx     free
+	   %esp     copied stack frame
+	*/
+	movl %eax, (%ebx)
+	movl 12(%ebx), %edx
+	movl 16(%ebx), %ecx
+	movl 20(%ebx), %eax
+	call *(%ebx)
+	movl %ebx, %esp
+	cfi_def_cfa_register (esp)
+	movl 8(%esp), %ebx
+	cfi_restore (ebx)
+	/*
+	    +40     return address
+	    +36     PLT1
+	    +32     PLT2
+	    +28     %esp
+	    +24     %ebp
+	    +20     %eax
+	    +16     %ecx
+	    +12     %edx
+	    +8      free
+	    +4      free
+	   %esp     free
+	*/
+#if LONG_DOUBLE_SIZE != 12
+# error "long double size must be 12 bytes"
+#endif
+	# Allocate space for La_i86_retval, reusing the 12 bytes
+	# already free on the stack.
+	subl $(LRV_SIZE - 12), %esp
+	cfi_adjust_cfa_offset (LRV_SIZE - 12)
+	movl %eax, LRV_EAX_OFFSET(%esp)
+	movl %edx, LRV_EDX_OFFSET(%esp)
+	fstpt LRV_ST0_OFFSET(%esp)
+	fstpt LRV_ST1_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, LRV_BND0_OFFSET(%esp)
+	bndmov %bnd1, LRV_BND1_OFFSET(%esp)
+#else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,LRV_BND1_OFFSET
+#endif
+	pushl %esp
+	cfi_adjust_cfa_offset (4)
+	# Address of La_i86_regs area.
+	leal (LRV_SIZE + 4)(%esp), %ecx
+	# PLT2
+	movl (LRV_SIZE + 4 + LR_SIZE)(%esp), %eax
+	# PLT1
+	movl (LRV_SIZE + 4 + LR_SIZE + 4)(%esp), %edx
+	call _dl_call_pltexit
+	movl LRV_EAX_OFFSET(%esp), %eax
+	movl LRV_EDX_OFFSET(%esp), %edx
+	fldt LRV_ST1_OFFSET(%esp)
+	fldt LRV_ST0_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov LRV_BND0_OFFSET(%esp), %bnd0
+	bndmov LRV_BND1_OFFSET(%esp), %bnd1
+#else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,LRV_BND1_OFFSET
+#endif
+	# Restore stack before return.
+	addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
+	cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
+	PRESERVE_BND_REGS_PREFIX
+	ret
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
+#endif
diff --git a/REORG.TODO/sysdeps/i386/ffs.c b/REORG.TODO/sysdeps/i386/ffs.c
new file mode 100644
index 0000000000..c229c8166e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ffs.c
@@ -0,0 +1,50 @@
+/* ffs -- find first set bit in a word, counted from least significant end.
+   For Intel 80x86, x>=3.
+   This file is part of the GNU C Library.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef	ffs
+
+#ifdef	__GNUC__
+
+int
+__ffs (int x)
+{
+  int cnt;
+  int tmp;
+
+  asm ("xorl %0,%0\n"		/* Set CNT to zero.  */
+       "bsfl %2,%1\n"		/* Count low bits in X and store in %1.  */
+       "jz 1f\n"		/* Jump if OK, i.e. X was non-zero.  */
+       "leal 1(%1),%0\n"	/* Return bsfl-result plus one on %0.  */
+       "1:" : "=&a" (cnt), "=r" (tmp) : "rm" (x));
+
+  return cnt;
+}
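+
+/* Worked example: ffs (0) == 0 and ffs (0x18) == 4, since the lowest
+   set bit of 0b11000 is bit 3 and the result is 1-based.  */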
+weak_alias (__ffs, ffs)
+libc_hidden_def (__ffs)
+libc_hidden_builtin_def (ffs)
+#undef ffsl
+weak_alias (__ffs, ffsl)
+
+#else
+#include <string/ffs.c>
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/Implies b/REORG.TODO/sysdeps/i386/fpu/Implies
new file mode 100644
index 0000000000..2b745a34fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/Implies
@@ -0,0 +1 @@
+x86/fpu
diff --git a/REORG.TODO/sysdeps/i386/fpu/Versions b/REORG.TODO/sysdeps/i386/fpu/Versions
new file mode 100644
index 0000000000..a2eec371f1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/Versions
@@ -0,0 +1,6 @@
+libm {
+  GLIBC_2.2 {
+    # functions used in inline functions or macros
+    __expl; __expm1l;
+  }
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/doasin.c b/REORG.TODO/sysdeps/i386/fpu/doasin.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/doasin.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acos.S b/REORG.TODO/sysdeps/i386/fpu/e_acos.S
new file mode 100644
index 0000000000..586c7fc406
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acos.S
@@ -0,0 +1,25 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: e_acos.S,v 1.4 1995/05/08 23:44:37 jtc Exp $")
+
+/* acos = atan (sqrt((1-x) (1+x)) / x) */
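+/* The factored form (1-x)(1+x) equals 1 - x^2 but is more accurate
+   near |x| == 1 than squaring x first and subtracting from 1.  */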
+ENTRY(__ieee754_acos)
+	fldl	4(%esp)			/* x */
+	fld	%st			/* x : x */
+	fld1				/* 1 : x : x */
+	fsubp				/* 1 - x : x */
+	fld1				/* 1 : 1 - x : x */
+	fadd	%st(2)			/* 1 + x : 1 - x : x */
+	fmulp				/* 1 - x^2 : x */
+	fsqrt				/* sqrt (1 - x^2) : x */
+	fabs
+	fxch	%st(1)			/* x : sqrt (1 - x^2) */
+	fpatan				/* atan (sqrt(1 - x^2) / x) */
+	ret
+END (__ieee754_acos)
+strong_alias (__ieee754_acos, __acos_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosf.S b/REORG.TODO/sysdeps/i386/fpu/e_acosf.S
new file mode 100644
index 0000000000..54930af8b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosf.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+/* acos = atan (sqrt(1 - x^2) / x) */
+ENTRY(__ieee754_acosf)
+	flds	4(%esp)			/* x */
+	fld	%st
+	fmul	%st(0)			/* x^2 */
+	fld1
+	fsubp				/* 1 - x^2 */
+	fsqrt				/* sqrt (1 - x^2) */
+	fabs
+	fxch	%st(1)
+	fpatan
+	ret
+END (__ieee754_acosf)
+strong_alias (__ieee754_acosf, __acosf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosh.S b/REORG.TODO/sysdeps/i386/fpu/e_acosh.S
new file mode 100644
index 0000000000..9555ef8078
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosh.S
@@ -0,0 +1,101 @@
+/* ix87 specific implementation of arccosh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_acosh)
+	movl	8(%esp), %ecx
+	cmpl	$0x3ff00000, %ecx
+	jl	5f			// < 1 => invalid
+	fldln2				// log(2)
+	fldl	4(%esp)			// x : log(2)
+	cmpl	$0x41b00000, %ecx
+	ja	3f			// x > 2^28
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x40000000, %ecx
+	ja	4f			// x > 2
+
+	// 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	fsubl	MO(one)			// x-1 : log(2)
+	fabs				// acosh(1) is +0 in all rounding modes
+	fld	%st			// x-1 : x-1 : log(2)
+	fmul	%st(1)			// (x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// x-1+(x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// 2*(x-1)+(x-1)^2 : x-1 : log(2)
+	fsqrt				// sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+	faddp				// x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fcoml	MO(limit)
+	fnstsw
+	sahf
+	ja	2f
+	fyl2xp1				// log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+2:	faddl	MO(one)			// x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fyl2x				// log(x+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+	// x > 2^28 => y = log(x) + log(2)
+	.align ALIGNARG(4)
+3:	fyl2x				// log(x)
+	fldln2				// log(2) : log(x)
+	faddp				// log(x)+log(2)
+	ret
+
+	// 2^28 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+	.align ALIGNARG(4)
+4:	fld	%st			// x : x : log(2)
+	fadd	%st, %st(1)		// x : 2*x : log(2)
+	fld	%st			// x : x : 2*x : log(2)
+	fmul	%st(1)			// x^2 : x : 2*x : log(2)
+	fsubl	MO(one)			// x^2-1 : x : 2*x : log(2)
+	fsqrt				// sqrt(x^2-1) : x : 2*x : log(2)
+	faddp				// x+sqrt(x^2-1) : 2*x : log(2)
+	fdivrl	MO(one)			// 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+	fsubrp				// 2*x-1/(x+sqrt(x^2-1)) : log(2)
+	fyl2x				// log(2*x-1/(x+sqrt(x^2-1)))
+	ret
+
+	// x < 1 (or -NaN) => NaN
+	.align ALIGNARG(4)
+5:	fldl	4(%esp)
+	fsub	%st
+	fdiv	%st, %st(0)
+	ret
+END(__ieee754_acosh)
+strong_alias (__ieee754_acosh, __acosh_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S b/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S
new file mode 100644
index 0000000000..662fda3c06
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S
@@ -0,0 +1,101 @@
+/* ix87 specific implementation of arccosh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_acoshf)
+	movl	4(%esp), %ecx
+	cmpl	$0x3f800000, %ecx
+	jl	5f			// < 1 => invalid
+	fldln2				// log(2)
+	flds	4(%esp)			// x : log(2)
+	cmpl	$0x47000000, %ecx
+	ja	3f			// x > 2^15
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x40000000, %ecx
+	ja	4f			// x > 2
+
+	// 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	fsubl	MO(one)			// x-1 : log(2)
+	fabs				// acosh(1) is +0 in all rounding modes
+	fld	%st			// x-1 : x-1 : log(2)
+	fmul	%st(1)			// (x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// x-1+(x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// 2*(x-1)+(x-1)^2 : x-1 : log(2)
+	fsqrt				// sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+	faddp				// x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fcoml	MO(limit)
+	fnstsw
+	sahf
+	ja	2f
+	fyl2xp1				// log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+2:	faddl	MO(one)			// x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fyl2x				// log(x+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+	// x > 2^15 => y = log(x) + log(2)
+	.align ALIGNARG(4)
+3:	fyl2x				// log(x)
+	fldln2				// log(2) : log(x)
+	faddp				// log(x)+log(2)
+	ret
+
+	// 2^15 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+	.align ALIGNARG(4)
+4:	fld	%st			// x : x : log(2)
+	fadd	%st, %st(1)		// x : 2*x : log(2)
+	fld	%st			// x : x : 2*x : log(2)
+	fmul	%st(1)			// x^2 : x : 2*x : log(2)
+	fsubl	MO(one)			// x^2-1 : x : 2*x : log(2)
+	fsqrt				// sqrt(x^2-1) : x : 2*x : log(2)
+	faddp				// x+sqrt(x^2-1) : 2*x : log(2)
+	fdivrl	MO(one)			// 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+	fsubrp				// 2*x-1/(x+sqrt(x^2-1)) : log(2)
+	fyl2x				// log(2*x-1/(x+sqrt(x^2-1)))
+	ret
+
+	// x < 1 (or -NaN) => NaN
+	.align ALIGNARG(4)
+5:	flds	4(%esp)
+	fsub	%st
+	fdiv	%st, %st(0)
+	ret
+END(__ieee754_acoshf)
+strong_alias (__ieee754_acoshf, __acoshf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S b/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S
new file mode 100644
index 0000000000..e0d6466aac
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S
@@ -0,0 +1,107 @@
+/* ix87 specific implementation of arccosh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	/* Please note that we use a double value for 1.0.  This number
+	   has an exact representation and so we don't get accuracy
+	   problems.  The advantage is that the code is simpler.  */
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_acoshl)
+	movl	12(%esp), %ecx
+	andl	$0xffff, %ecx
+	cmpl	$0x3fff, %ecx
+	jl	5f			// < 1 => invalid
+	fldln2				// log(2)
+	fldt	4(%esp)			// x : log(2)
+	cmpl	$0x4020, %ecx
+	ja	3f			// x > 2^34
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x4000, %ecx
+	ja	4f			// x > 2
+
+	// 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	fsubl	MO(one)			// x-1 : log(2)
+	fabs				// acosh(1) is +0 in all rounding modes
+	fld	%st			// x-1 : x-1 : log(2)
+	fmul	%st(1)			// (x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// x-1+(x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// 2*(x-1)+(x-1)^2 : x-1 : log(2)
+	fsqrt				// sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+	faddp				// x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fcoml	MO(limit)
+	fnstsw
+	sahf
+	ja	2f
+	fyl2xp1				// log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+2:	faddl	MO(one)			// x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fyl2x				// log(x+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+	// x > 2^34 => y = log(x) + log(2)
+	.align ALIGNARG(4)
+3:	fyl2x				// log(x)
+	fldln2				// log(2) : log(x)
+	faddp				// log(x)+log(2)
+	ret
+
+	// 2^34 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+	.align ALIGNARG(4)
+4:	fld	%st			// x : x : log(2)
+	fadd	%st, %st(1)		// x : 2*x : log(2)
+	fld	%st			// x : x : 2*x : log(2)
+	fmul	%st(1)			// x^2 : x : 2*x : log(2)
+	fsubl	MO(one)			// x^2-1 : x : 2*x : log(2)
+	fsqrt				// sqrt(x^2-1) : x : 2*x : log(2)
+	faddp				// x+sqrt(x^2-1) : 2*x : log(2)
+	fdivrl	MO(one)			// 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+	fsubrp				// 2*x-1/(x+sqrt(x^2-1)) : log(2)
+	fyl2x				// log(2*x-1/(x+sqrt(x^2-1)))
+	ret
+
+	// x < 1 => NaN
+	.align ALIGNARG(4)
+5:	fldz
+	fdiv	%st, %st(0)
+	ret
+END(__ieee754_acoshl)
+strong_alias (__ieee754_acoshl, __acoshl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosl.c b/REORG.TODO/sysdeps/i386/fpu/e_acosl.c
new file mode 100644
index 0000000000..ab08931924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosl.c
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_acosl (long double x)
+{
+  long double res;
+
+  /* acosl = atanl (sqrtl((1-x) (1+x)) / x) */
+  asm (	"fld	%%st\n"
+	"fld1\n"
+	"fsubp\n"
+	"fld1\n"
+	"fadd	%%st(2)\n"
+	"fmulp\n"			/* 1 - x^2 */
+	"fsqrt\n"			/* sqrtl (1 - x^2) */
+	"fabs\n"
+	"fxch	%%st(1)\n"
+	"fpatan"
+	: "=t" (res) : "0" (x) : "st(1)");
+  return res;
+}
+strong_alias (__ieee754_acosl, __acosl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_asin.S b/REORG.TODO/sysdeps/i386/fpu/e_asin.S
new file mode 100644
index 0000000000..39c8b47da4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_asin.S
@@ -0,0 +1,38 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_asin.S,v 1.4 1995/05/08 23:45:40 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+
+/* asin = atan (x / sqrt((1-x) (1+x))) */
+ENTRY(__ieee754_asin)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)			/* x */
+	fld	%st
+	fld1				/* 1 : x : x */
+	fsubp				/* 1 - x : x */
+	fld1				/* 1 : 1 - x : x */
+	fadd	%st(2)			/* 1 + x : 1 - x : x */
+	fmulp				/* 1 - x^2 */
+	fsqrt				/* sqrt (1 - x^2) */
+	fpatan
+	DBL_CHECK_FORCE_UFLOW
+	ret
+END (__ieee754_asin)
+strong_alias (__ieee754_asin, __asin_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_asinf.S b/REORG.TODO/sysdeps/i386/fpu/e_asinf.S
new file mode 100644
index 0000000000..1102bdedfd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_asinf.S
@@ -0,0 +1,39 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: $")
+
+	.section .rodata.cst4,"aM",@progbits,4
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+
+/* asin = atan (x / sqrt(1 - x^2)) */
+ENTRY(__ieee754_asinf)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)			/* x */
+	fld	%st
+	fmul	%st(0)			/* x^2 */
+	fld1
+	fsubp				/* 1 - x^2 */
+	fsqrt				/* sqrt (1 - x^2) */
+	fpatan
+	FLT_CHECK_FORCE_UFLOW
+	ret
+END (__ieee754_asinf)
+strong_alias (__ieee754_asinf, __asinf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2.S b/REORG.TODO/sysdeps/i386/fpu/e_atan2.S
new file mode 100644
index 0000000000..25f43bb5a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_atan2.S,v 1.4 1995/05/08 23:46:28 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atan2)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	 4(%esp)
+	fldl	12(%esp)
+	fpatan
+	DBL_CHECK_FORCE_UFLOW_NARROW
+	ret
+END (__ieee754_atan2)
+strong_alias (__ieee754_atan2, __atan2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S b/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S
new file mode 100644
index 0000000000..2bc909a762
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_atan2f.S,v 1.1 1995/05/08 23:35:10 jtc Exp $")
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atan2f)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)
+	flds	8(%esp)
+	fpatan
+	FLT_CHECK_FORCE_UFLOW_NARROW
+	ret
+END (__ieee754_atan2f)
+strong_alias (__ieee754_atan2f, __atan2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c b/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c
new file mode 100644
index 0000000000..9f88bfcc08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_atan2l (long double y, long double x)
+{
+  long double res;
+
+  asm ("fpatan" : "=t" (res) : "u" (y), "0" (x) : "st(1)");
+
+  return res;
+}
+strong_alias (__ieee754_atan2l, __atan2l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanh.S b/REORG.TODO/sysdeps/i386/fpu/e_atanh.S
new file mode 100644
index 0000000000..cbc93d5da2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanh.S
@@ -0,0 +1,112 @@
+/* ix87 specific implementation of arctanh function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type half,@object
+half:	.double 0.5
+	ASM_SIZE_DIRECTIVE(half)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	.type ln2_2,@object
+ln2_2:	.tfloat 0.3465735902799726547086160
+	ASM_SIZE_DIRECTIVE(ln2_2)
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
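+
+/* Derivation for the small-|x| path: atanh(x) = 0.5*log((1+x)/(1-x))
+   and (1+x)/(1-x) = 1 + 2x/(1-x) = 1 + 2x + 2x^2/(1-x), so fyl2xp1
+   can be applied to u = 2|x| + 2|x|^2/(1-|x|) while u is small;
+   otherwise fyl2x is used on 1 + u (or on (1+|x|)/(1-|x|) directly
+   for |x| >= 0.5).  */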
+
+	.text
+ENTRY(__ieee754_atanh)
+	movl	8(%esp), %ecx
+
+	movl	%ecx, %eax
+	andl	$0x7fffffff, %eax
+	cmpl	$0x7ff00000, %eax
+	jae	5f
+7:
+
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	andl	$0x80000000, %ecx // ECX == 0 iff X >= 0
+
+	fldt	MO(ln2_2)	// 0.5*ln2
+	xorl	%ecx, 8(%esp)
+	fldl	4(%esp)		// |x| : 0.5*ln2
+	fcoml	MO(half)	// |x| : 0.5*ln2
+	fld	%st		// |x| : |x| : 0.5*ln2
+	fnstsw			// |x| : |x| : 0.5*ln2
+	sahf
+	jae	2f
+	fadd	%st, %st(1)	// |x| : 2*|x| : 0.5*ln2
+	fld	%st		// |x| : |x| : 2*|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : |x| : 2*|x| : 0.5*ln2
+	fxch			// |x| : 1-|x| : 2*|x| : 0.5*ln2
+	fmul	%st(2)		// 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+	fdivp			// (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+	faddp			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fcoml	MO(limit)	// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fnstsw			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	sahf
+	jae	4f
+	fyl2xp1			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	DBL_CHECK_FORCE_UFLOW_NONNEG
+	jecxz	3f
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+4:	faddl	MO(one)		// 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	jecxz	3f
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+2:	faddl	MO(one)		// 1+|x| : |x| : 0.5*ln2
+	fxch			// |x| : 1+|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : 1+|x| : 0.5*ln2
+	fdivrp			// (1+|x|)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld((1+|x|)/(1-|x|))
+	jecxz	3f
+	fchs			// 0.5*ln2*ld((1+x)/(1-x))
+3:	ret
+
+	// x == NaN or ±Inf
+5:	ja	6f
+	cmpl	$0, 4(%esp)
+	je	7b
+6:	fldl	4(%esp)
+	ret
+END(__ieee754_atanh)
+strong_alias (__ieee754_atanh, __atanh_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S b/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S
new file mode 100644
index 0000000000..92fda3fd82
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S
@@ -0,0 +1,109 @@
+/* ix87 specific implementation of arctanh function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type half,@object
+half:	.double 0.5
+	ASM_SIZE_DIRECTIVE(half)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	.align ALIGNARG(4)
+	.type ln2_2,@object
+ln2_2:	.tfloat 0.3465735902799726547086160
+	ASM_SIZE_DIRECTIVE(ln2_2)
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atanhf)
+	movl	4(%esp), %ecx
+
+	movl	%ecx, %eax
+	andl	$0x7fffffff, %eax
+	cmpl	$0x7f800000, %eax
+	ja	5f
+
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	andl	$0x80000000, %ecx // ECX == 0 iff X >= 0
+
+	fldt	MO(ln2_2)	// 0.5*ln2
+	xorl	%ecx, 4(%esp)
+	flds	4(%esp)		// |x| : 0.5*ln2
+	fcoml	MO(half)	// |x| : 0.5*ln2
+	fld	%st(0)		// |x| : |x| : 0.5*ln2
+	fnstsw			// |x| : |x| : 0.5*ln2
+	sahf
+	jae	2f
+	fadd	%st, %st(1)	// |x| : 2*|x| : 0.5*ln2
+	fld	%st		// |x| : |x| : 2*|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : |x| : 2*|x| : 0.5*ln2
+	fxch			// |x| : 1-|x| : 2*|x| : 0.5*ln2
+	fmul	%st(2)		// 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+	fdivp			// (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+	faddp			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fcoml	MO(limit)	// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fnstsw			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	sahf
+	jae	4f
+	fyl2xp1			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	FLT_CHECK_FORCE_UFLOW_NONNEG
+	jecxz	3f
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+4:	faddl	MO(one)		// 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	jecxz	3f
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+2:	faddl	MO(one)		// 1+|x| : |x| : 0.5*ln2
+	fxch			// |x| : 1+|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : 1+|x| : 0.5*ln2
+	fdivrp			// (1+|x|)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld((1+|x|)/(1-|x|))
+	jecxz	3f
+	fchs			// 0.5*ln2*ld((1+x)/(1-x))
+3:	ret
+
+	// x == NaN
+5:	flds	4(%esp)
+	ret
+END(__ieee754_atanhf)
+strong_alias (__ieee754_atanhf, __atanhf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S b/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S
new file mode 100644
index 0000000000..31ff7e5182
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S
@@ -0,0 +1,127 @@
+/* ix87 specific implementation of arctanh function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	/* Please note that we use double values for 0.5 and 1.0.  These
+	   numbers have exact representations and so we don't get accuracy
+	   problems.  The advantage is that the code is simpler.  */
+	.type half,@object
+half:	.double 0.5
+	ASM_SIZE_DIRECTIVE(half)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	.align ALIGNARG(4)
+	.type ln2_2,@object
+ln2_2:	.tfloat 0.3465735902799726547086160
+	ASM_SIZE_DIRECTIVE(ln2_2)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atanhl)
+	movl	12(%esp), %ecx
+
+	movl	%ecx, %eax
+	andl	$0x7fff, %eax
+	cmpl	$0x7fff, %eax
+	je	5f
+	cmpl	$0x3fdf, %eax
+	jge	7f
+	// Exponent below -32; return x, with underflow if subnormal.
+	fldt	4(%esp)
+	cmpl	$0, %eax
+	jne	8f
+	fld	%st(0)
+	fmul	%st(0)
+	fstp	%st(0)
+8:	ret
+7:
+
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	andl	$0x8000, %ecx	// ECX == 0 iff X >= 0
+
+	fldt	MO(ln2_2)	// 0.5*ln2
+	xorl	%ecx, 12(%esp)
+	fldt	4(%esp)		// |x| : 0.5*ln2
+	fcoml	MO(half)	// |x| : 0.5*ln2
+	fld	%st(0)		// |x| : |x| : 0.5*ln2
+	fnstsw			// |x| : |x| : 0.5*ln2
+	sahf
+	jae	2f
+	fadd	%st, %st(1)	// |x| : 2*|x| : 0.5*ln2
+	fld	%st		// |x| : |x| : 2*|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : |x| : 2*|x| : 0.5*ln2
+	fxch			// |x| : 1-|x| : 2*|x| : 0.5*ln2
+	fmul	%st(2)		// 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+	fdivp			// (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+	faddp			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fcoml	MO(limit)	// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fnstsw			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	sahf
+	jae	4f
+	fyl2xp1			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	jecxz	3f
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+4:	faddl	MO(one)		// 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	jecxz	3f
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+2:	faddl	MO(one)		// 1+|x| : |x| : 0.5*ln2
+	fxch			// |x| : 1+|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : 1+|x| : 0.5*ln2
+	fdivrp			// (1+|x|)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld((1+|x|)/(1-|x|))
+	jecxz	3f
+	fchs			// 0.5*ln2*ld((1+x)/(1-x))
+3:	ret
+
+	// x == NaN or ±Inf
+5:	cmpl	$0x80000000, 8(%esp)
+	ja	6f
+	cmpl	$0, 4(%esp)
+	je	7b
+6:	fldt	4(%esp)
+	fadd	%st(0)
+	ret
+END(__ieee754_atanhl)
+strong_alias (__ieee754_atanhl, __atanhl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp.S b/REORG.TODO/sysdeps/i386/fpu/e_exp.S
new file mode 100644
index 0000000000..a7e7f13f6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp.S
@@ -0,0 +1,73 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+/* e^x = 2^(x * log2(e)) */
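+/* Below, t = x*log2(e) is split into n = rint(t) and f = t - n, with
+   |f| <= 0.5 under the default round-to-nearest mode; f2xm1 (defined
+   for arguments in [-1, +1]) yields 2^f - 1, and fscale multiplies
+   by 2^n.  */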
+ENTRY(__ieee754_exp)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax
+	movb	$0x45, %dh
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f			/* Is +-Inf, jump.  */
+	fldl2e
+	fmulp				/* x * log2(e) */
+	fld	%st
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)
+	DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp)
+
+
+ENTRY(__exp_finite)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl2e
+	fmull	4(%esp)			/* x * log2(e) */
+	fld	%st
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)
+	DBL_NARROW_EVAL_UFLOW_NONNEG
+	ret
+END(__exp_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10.S
new file mode 100644
index 0000000000..acb5160a3f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10.S
@@ -0,0 +1,53 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+/* 10^x = 2^(x * log2(10)) */
+ENTRY(__ieee754_exp10)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax
+	movb	$0x45, %dh
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f			/* Is +-Inf, jump.  */
+	fldl2t
+	fmulp				/* x * log2(10) */
+	fld	%st
+	frndint				/* int(x * log2(10)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(10)) */
+	fxch
+	f2xm1				/* 2^(fract(x * log2(10))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(10))) */
+	fscale				/* 10^x */
+	fstp	%st(1)
+	DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp10)
+strong_alias (__ieee754_exp10, __exp10_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S
new file mode 100644
index 0000000000..1812b34398
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S
@@ -0,0 +1,53 @@
+/*
+ * Written by Ulrich Drepper.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+/* 10^x = 2^(x * log2(10)) */
+ENTRY(__ieee754_exp10f)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax
+	movb	$0x45, %dh
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f			/* Is +-Inf, jump.  */
+	fldl2t
+	fmulp				/* x * log2(10) */
+	fld	%st
+	frndint				/* int(x * log2(10)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(10)) */
+	fxch
+	f2xm1				/* 2^(fract(x * log2(10))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(10))) */
+	fscale				/* 10^x */
+	fstp	%st(1)
+	FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp10f)
+strong_alias (__ieee754_exp10f, __exp10f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S
new file mode 100644
index 0000000000..d843e2b5e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S
@@ -0,0 +1,2 @@
+#define USE_AS_EXP10L
+#include <e_expl.S>
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2.S
new file mode 100644
index 0000000000..fc16a96053
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2.S
@@ -0,0 +1,52 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_exp2)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax
+	movb	$0x45, %dh
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f			/* Is +-Inf, jump.  */
+	fld	%st
+	frndint				/* int(x) */
+	fsubr	%st,%st(1)		/* fract(x) */
+	fxch
+	f2xm1				/* 2^(fract(x)) - 1 */
+	fld1
+	faddp				/* 2^(fract(x)) */
+	fscale				/* 2^x */
+	fstp	%st(1)
+	DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp2)
+strong_alias (__ieee754_exp2, __exp2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S
new file mode 100644
index 0000000000..30623cd850
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S
@@ -0,0 +1,52 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_exp2f)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax
+	movb	$0x45, %dh
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f			/* Is +-Inf, jump.  */
+	fld	%st
+	frndint				/* int(x) */
+	fsubr	%st,%st(1)		/* fract(x) */
+	fxch
+	f2xm1				/* 2^(fract(x)) - 1 */
+	fld1
+	faddp				/* 2^(fract(x)) */
+	fscale				/* 2^x */
+	fstp	%st(1)
+	FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp2f)
+strong_alias (__ieee754_exp2f, __exp2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S
new file mode 100644
index 0000000000..c4cb73d589
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S
@@ -0,0 +1,60 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_exp2l)
+#ifdef PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldt	4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax
+	movb	$0x45, %dh
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f			/* Is +-Inf, jump.  */
+	movzwl	4+8(%esp), %eax
+	andl	$0x7fff, %eax
+	cmpl	$0x3fbe, %eax
+	jge	3f
+	/* Argument's exponent below -65, result rounds to 1.  */
+	fld1
+	faddp
+	ret
+3:	fld	%st
+	frndint				/* int(x) */
+	fsubr	%st,%st(1)		/* fract(x) */
+	fxch
+	f2xm1				/* 2^(fract(x)) - 1 */
+	fld1
+	faddp				/* 2^(fract(x)) */
+	fscale				/* 2^x */
+	fstp	%st(1)
+	LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp2l)
+strong_alias (__ieee754_exp2l, __exp2l_finite)
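
The cmpl $0x3fbe early exit above is worth a note: for |x| below 2^-65 the
code returns 1 + x instead of running the full reduction.  For tiny nonzero x
that expression rounds to the correctly rounded 2^x in every rounding mode and
raises the inexact flag, because |x ln 2| is below half an ulp of 1.  In C
terms (a sketch, not the actual source):

static long double
exp2l_tiny (long double x)	/* assumes |x| < 0x1p-65L */
{
  return 1.0L + x;		/* fld1; faddp: correctly rounded 2^x */
}
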
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_expf.S b/REORG.TODO/sysdeps/i386/fpu/e_expf.S
new file mode 100644
index 0000000000..65cb4ec204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_expf.S
@@ -0,0 +1,74 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+/* e^x = 2^(x * log2(e)) */
+ENTRY(__ieee754_expf)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax
+	movb	$0x45, %dh
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f			/* Is +-Inf, jump.  */
+	fldl2e
+	fmulp				/* x * log2(e) */
+	fld	%st
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)
+	FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_expf)
+
+
+ENTRY(__expf_finite)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl2e
+	fmuls	4(%esp)			/* x * log2(e) */
+	fld	%st
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)
+	FLT_NARROW_EVAL_UFLOW_NONNEG
+	ret
+END(__expf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_expl.S b/REORG.TODO/sysdeps/i386/fpu/e_expl.S
new file mode 100644
index 0000000000..7d75fe22a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_expl.S
@@ -0,0 +1,226 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+/*
+ * The 8087 method for the exponential function is to calculate
+ *   exp(x) = 2^(x log2(e))
+ * after separating integer and fractional parts
+ *   x log2(e) = i + f, |f| <= .5
+ * 2^i is immediate but f needs to be precise for long double accuracy.
+ * Suppress range reduction error in computing f by the following.
+ * Separate x into integer and fractional parts
+ *   x = xi + xf, |xf| <= .5
+ * Separate log2(e) into the sum of an exact number c0 and small part c1.
+ *   c0 + c1 = log2(e) to extra precision
+ * Then
+ *   f = (c0 xi - i) + c0 xf + c1 x
+ * where c0 xi is exact and so also is (c0 xi - i).
+ * -- moshier@na-net.ornl.gov
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+#ifdef USE_AS_EXP10L
+# define IEEE754_EXPL __ieee754_exp10l
+# define EXPL_FINITE __exp10l_finite
+# define FLDLOG fldl2t
+#elif defined USE_AS_EXPM1L
+# define IEEE754_EXPL __expm1l
+# undef EXPL_FINITE
+# define FLDLOG fldl2e
+#else
+# define IEEE754_EXPL __ieee754_expl
+# define EXPL_FINITE __expl_finite
+# define FLDLOG fldl2e
+#endif
+
+	.section .rodata.cst16,"aM",@progbits,16
+
+	.p2align 4
+#ifdef USE_AS_EXP10L
+	.type c0,@object
+c0:	.byte 0, 0, 0, 0, 0, 0, 0x9a, 0xd4, 0x00, 0x40
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(c0)
+	.type c1,@object
+c1:	.byte 0x58, 0x92, 0xfc, 0x15, 0x37, 0x9a, 0x97, 0xf0, 0xef, 0x3f
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(c1)
+#else
+	.type c0,@object
+c0:	.byte 0, 0, 0, 0, 0, 0, 0xaa, 0xb8, 0xff, 0x3f
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(c0)
+	.type c1,@object
+c1:	.byte 0x20, 0xfa, 0xee, 0xc2, 0x5f, 0x70, 0xa5, 0xec, 0xed, 0x3f
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(c1)
+#endif
+#ifndef USE_AS_EXPM1L
+	.type csat,@object
+csat:	.byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x0e, 0x40
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(csat)
+DEFINE_LDBL_MIN
+#endif
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(IEEE754_EXPL)
+#ifdef USE_AS_EXPM1L
+	movzwl	4+8(%esp), %eax
+	xorb	$0x80, %ah	// invert sign bit (now 1 is "positive")
+	cmpl	$0xc006, %eax	// is num positive and exp >= 6 (number is >= 128.0)?
+	jae	HIDDEN_JUMPTARGET (__expl) // (if num is denormal, it is at least >= 64.0)
+#endif
+	fldt	4(%esp)
+/* I added the following ugly construct because expl(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam			/* Is NaN or +-Inf?  */
+#ifdef PIC
+	LOAD_PIC_REG (cx)
+#endif
+#ifdef USE_AS_EXPM1L
+	xorb	$0x80, %ah
+	cmpl	$0xc006, %eax
+	fstsw	%ax
+	movb	$0x45, %dh
+	jb	4f
+
+	/* Below -64.0 (may be -NaN or -Inf). */
+	andb	%ah, %dh
+	cmpb	$0x01, %dh
+	je	6f		/* Is +-NaN, jump.  */
+	jmp	1f		/* -large, possibly -Inf.  */
+
+4:	/* In range -64.0 to 64.0 (may be +-0 but not NaN or +-Inf).  */
+	/* Test for +-0 as argument.  */
+	andb	%ah, %dh
+	cmpb	$0x40, %dh
+	je	2f
+
+	/* Test for arguments that are small but not subnormal.  */
+	movzwl	4+8(%esp), %eax
+	andl	$0x7fff, %eax
+	cmpl	$0x3fbf, %eax
+	jge	3f
+	/* Argument's exponent below -64; avoid spurious underflow if
+	   normal.  */
+	cmpl	$0x0001, %eax
+	jge	2f
+	/* Force underflow and return the argument, to avoid wrong signs
+	   of zero results from the code below in some rounding modes.  */
+	fld	%st
+	fmul	%st
+	fstp	%st
+	jmp	2f
+#else
+	movzwl	4+8(%esp), %eax
+	andl	$0x7fff, %eax
+	cmpl	$0x400d, %eax
+	jg	5f
+	cmpl	$0x3fbc, %eax
+	jge	3f
+	/* Argument's exponent below -67, result rounds to 1.  */
+	fld1
+	faddp
+	jmp	2f
+5:	/* Overflow, underflow, infinity or NaN as argument.  */
+	fstsw	%ax
+	movb	$0x45, %dh
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f		/* Is +-Inf, jump.    */
+	cmpb	$0x01, %dh
+	je	6f		/* Is +-NaN, jump.    */
+	/* Overflow or underflow; saturate.  */
+	fstp	%st
+	fldt	MO(csat)
+	andb	$2, %ah
+	jz	3f
+	fchs
+#endif
+3:	FLDLOG			/* 1  log2(base)      */
+	fmul	%st(1), %st	/* 1  x log2(base)    */
+	/* Set round-to-nearest temporarily.  */
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fstcw	4(%esp)
+	movl	$0xf3ff, %edx
+	andl	4(%esp), %edx
+	movl	%edx, (%esp)
+	fldcw	(%esp)
+	frndint			/* 1  i               */
+	fld	%st(1)		/* 2  x               */
+	frndint			/* 2  xi              */
+	fldcw	4(%esp)
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fld	%st(1)		/* 3  i               */
+	fldt	MO(c0)		/* 4  c0              */
+	fld	%st(2)		/* 5  xi              */
+	fmul	%st(1), %st	/* 5  c0 xi           */
+	fsubp	%st, %st(2)	/* 4  f = c0 xi  - i  */
+	fld	%st(4)		/* 5  x               */
+	fsub	%st(3), %st	/* 5  xf = x - xi     */
+	fmulp	%st, %st(1)	/* 4  c0 xf           */
+	faddp	%st, %st(1)	/* 3  f = f + c0 xf   */
+	fldt	MO(c1)		/* 4                  */
+	fmul	%st(4), %st	/* 4  c1 * x          */
+	faddp	%st, %st(1)	/* 3  f = f + c1 * x  */
+	f2xm1			/* 3 2^(fract(x * log2(base))) - 1 */
+#ifdef USE_AS_EXPM1L
+	fstp	%st(1)		/* 2                  */
+	fscale			/* 2 scale factor is st(1); base^x - 2^i */
+	fxch			/* 2 i                */
+	fld1			/* 3 1.0              */
+	fscale			/* 3 2^i              */
+	fld1			/* 4 1.0              */
+	fsubrp	%st, %st(1)	/* 3 2^i - 1.0        */
+	fstp	%st(1)		/* 2                  */
+	faddp	%st, %st(1)	/* 1 base^x - 1.0     */
+#else
+	fld1			/* 4 1.0              */
+	faddp			/* 3 2^(fract(x * log2(base))) */
+	fstp	%st(1)		/* 2  */
+	fscale			/* 2 scale factor is st(1); base^x */
+	fstp	%st(1)		/* 1  */
+	LDBL_CHECK_FORCE_UFLOW_NONNEG
+#endif
+	fstp	%st(1)		/* 0  */
+	jmp	2f
+1:
+#ifdef USE_AS_EXPM1L
+	/* For expm1l, only negative sign gets here.  */
+	fstp	%st
+	fld1
+	fchs
+#else
+	testl	$0x200, %eax	/* Test sign.  */
+	jz	2f		/* If positive, jump.  */
+	fstp	%st
+	fldz			/* Set result to 0.  */
+#endif
+2:	ret
+6:	/* NaN argument.  */
+	fadd	%st
+	ret
+END(IEEE754_EXPL)
+#ifdef USE_AS_EXPM1L
+libm_hidden_def (__expm1l)
+weak_alias (__expm1l, expm1l)
+#else
+strong_alias (IEEE754_EXPL, EXPL_FINITE)
+#endif
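
The header comment's error-suppression scheme translates to C roughly as
follows.  This is a sketch of the arithmetic only: c0 and c1 stand for the
rodata constants above (c0 carries the high bits of log2(base) with enough
trailing zeros that c0*xi and c0*xi - i are exact), and the rounding-mode
fiddling the assembly does around frndint is omitted:

#include <math.h>

static long double
reduce_sketch (long double x, long double c0, long double c1)
{
  long double t  = x * (c0 + c1);	/* approx. x * log2(base) */
  long double i  = nearbyintl (t);	/* frndint (asm forces to-nearest) */
  long double xi = nearbyintl (x);	/* x = xi + xf, |xf| <= 0.5 */
  long double xf = x - xi;
  /* f = (c0 xi - i) + c0 xf + c1 x, where c0 xi and c0 xi - i are exact;
     the result is what gets fed to f2xm1.  */
  return (c0 * xi - i) + c0 * xf + c1 * x;
}
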
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmod.S b/REORG.TODO/sysdeps/i386/fpu/e_fmod.S
new file mode 100644
index 0000000000..26b3acc392
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmod.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_fmod)
+	fldl	12(%esp)
+	fldl	4(%esp)
+1:	fprem
+	fstsw	%ax
+	sahf
+	jp	1b
+	fstp	%st(1)
+	ret
+END (__ieee754_fmod)
+strong_alias (__ieee754_fmod, __fmod_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S b/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S
new file mode 100644
index 0000000000..ece4d98427
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_fmodf)
+	flds	8(%esp)
+	flds	4(%esp)
+1:	fprem
+	fstsw	%ax
+	sahf
+	jp	1b
+	fstp	%st(1)
+	ret
+END(__ieee754_fmodf)
+strong_alias (__ieee754_fmodf, __fmodf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c b/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c
new file mode 100644
index 0000000000..49700ae8f6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_fmodl (long double x, long double y)
+{
+  long double res;
+
+  asm ("1:\tfprem\n"
+       "fstsw   %%ax\n"
+       "sahf\n"
+       "jp      1b\n"
+       "fstp    %%st(1)"
+       : "=t" (res) : "0" (x), "u" (y) : "ax", "st(1)");
+  return res;
+}
+strong_alias (__ieee754_fmodl, __fmodl_finite)
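
All three fmod variants spin on fprem because a single fprem reduces the
exponent difference by at most 63; the C2 status bit, which sahf maps to PF,
stays set while the reduction is incomplete, hence the jp 1b retry.  A
shift-and-subtract sketch of the same partial-remainder idea (assumes finite x
and finite nonzero y; the subtraction is exact by Sterbenz's lemma):

#include <math.h>

static double
fmod_sketch (double x, double y)
{
  double ax = fabs (x), ay = fabs (y);
  while (ax >= ay)
    {
      double t = ay;
      while (t + t <= ax)	/* largest ay * 2^k not above ax */
        t += t;
      ax -= t;			/* exact, since t <= ax < 2t */
    }
  return copysign (ax, x);	/* fmod takes the sign of x */
}
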
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_hypot.S b/REORG.TODO/sysdeps/i386/fpu/e_hypot.S
new file mode 100644
index 0000000000..7403566fd7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_hypot.S
@@ -0,0 +1,75 @@
+/* Compute the hypotenuse of X and Y.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_hypot)
+#ifdef  PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fldl	4(%esp)		// x
+	fxam
+	fnstsw
+	fldl	12(%esp)	// y : x
+	movb	%ah, %ch
+	fxam
+	fnstsw
+	movb	%ah, %al
+	orb	%ch, %ah
+	sahf
+	jc	1f
+	fmul	%st(0)		// y * y : x
+	fxch			// x : y * y
+	fmul	%st(0)		// x * x : y * y
+	faddp			// x * x + y * y
+	fsqrt
+	DBL_NARROW_EVAL_UFLOW_NONNEG
+2:	ret
+
+	// We have to test whether either parameter is Inf;
+	// in that case the result is infinity.
+1:	andb	$0x45, %al
+	cmpb	$5, %al
+	je	3f		// jump if y is Inf
+	andb	$0x45, %ch
+	cmpb	$5, %ch
+	jne	4f		// jump if x is not Inf
+	fxch
+3:	fstp	%st(1)
+	fabs
+	jmp	2b
+
+4:	testb	$1, %al
+	jnz	5f		// y is NaN
+	fxch
+5:	fstp	%st(1)
+	jmp	2b
+
+END(__ieee754_hypot)
+strong_alias (__ieee754_hypot, __hypot_finite)
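
The double version can square and add without any scaling because the x87
registers carry a 15-bit exponent, so x*x + y*y cannot overflow (nor vanish
entirely) for double inputs.  Roughly, in C -- an illustration that may differ
from the assembly by a final double rounding:

#include <math.h>

static double
hypot_sketch (double x, double y)
{
  if (isinf (x) || isinf (y))		/* Inf wins even against NaN */
    return INFINITY;
  long double lx = x, ly = y;
  return (double) sqrtl (lx * lx + ly * ly);	/* no intermediate overflow */
}
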
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S b/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S
new file mode 100644
index 0000000000..6a2c7052b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S
@@ -0,0 +1,64 @@
+/* Compute the hypotenuse of X and Y.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <i386-math-asm.h>
+
+	.text
+ENTRY(__ieee754_hypotf)
+	flds	4(%esp)		// x
+	fxam
+	fnstsw
+	flds	8(%esp)		// y : x
+	movb	%ah, %ch
+	fxam
+	fnstsw
+	movb	%ah, %al
+	orb	%ch, %ah
+	sahf
+	jc	1f
+	fmul	%st(0)		// y * y : x
+	fxch			// x : y * y
+	fmul	%st(0)		// x * x : y * y
+	faddp			// x * x + y * y
+	fsqrt
+	FLT_NARROW_EVAL
+2:	ret
+
+	// We have to test whether either parameter is Inf;
+	// in that case the result is infinity.
+1:	andb	$0x45, %al
+	cmpb	$5, %al
+	je	3f		// jump if y is Inf
+	andb	$0x45, %ch
+	cmpb	$5, %ch
+	jne	4f		// jump if x is not Inf
+	fxch
+3:	fstp	%st(1)
+	fabs
+	jmp	2b
+
+4:	testb	$1, %al
+	jnz	5f		// y is NaN
+	fxch
+5:	fstp	%st(1)
+	jmp	2b
+
+END(__ieee754_hypotf)
+strong_alias (__ieee754_hypotf, __hypotf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S
new file mode 100644
index 0000000000..29ef2214e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S
@@ -0,0 +1,42 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ilogb.S,v 1.5 1995/10/12 15:53:09 jtc Exp $")
+
+ENTRY(__ieee754_ilogb)
+	fldl	4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+   required to return INT_MAX in ISO C99.
+   -- jakub@redhat.com.  */
+	fxam			/* Is NaN or +-Inf?  */
+	fstsw   %ax
+	movb    $0x45, %dh
+	andb    %ah, %dh
+	cmpb    $0x05, %dh
+	je      1f		/* Is +-Inf, jump.  */
+	cmpb    $0x40, %dh
+	je      2f		/* Is +-0, jump.  */
+
+	fxtract
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	fstp	%st
+
+	fistpl	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+
+	ret
+
+1:	fstp	%st
+	movl	$0x7fffffff, %eax
+	ret
+2:	fstp	%st
+	movl	$0x80000000, %eax	/* FP_ILOGB0  */
+	ret
+END (__ieee754_ilogb)
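
fxtract replaces st(0) by its significand and pushes the unbiased exponent
beneath it; the code drops the significand and converts the exponent with
fistpl.  A C sketch of the case analysis (FP_ILOGB0 and FP_ILOGBNAN are the
ISO C names; the assembly lets NaN fall through the conversion, where the
invalid fistpl happens to yield the same INT_MIN pattern on this target):

#include <limits.h>
#include <math.h>

static int
ilogb_sketch (double x)
{
  if (isinf (x))
    return INT_MAX;		/* label 1 above */
  if (x == 0.0)
    return FP_ILOGB0;		/* label 2 above */
  if (isnan (x))
    return FP_ILOGBNAN;
  return (int) logb (x);	/* fxtract; fistpl */
}
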
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S
new file mode 100644
index 0000000000..d72de6c84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S
@@ -0,0 +1,42 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ilogbf.S,v 1.4 1995/10/22 20:32:43 pk Exp $")
+
+ENTRY(__ieee754_ilogbf)
+	flds	4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+   required to return INT_MAX in ISO C99.
+   -- jakub@redhat.com.  */
+	fxam			/* Is NaN or +-Inf?  */
+	fstsw   %ax
+	movb    $0x45, %dh
+	andb    %ah, %dh
+	cmpb    $0x05, %dh
+	je      1f		/* Is +-Inf, jump.  */
+	cmpb    $0x40, %dh
+	je      2f		/* Is +-0, jump.  */
+
+	fxtract
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	fstp	%st
+
+	fistpl	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+
+	ret
+
+1:	fstp	%st
+	movl	$0x7fffffff, %eax
+	ret
+2:	fstp	%st
+	movl	$0x80000000, %eax	/* FP_ILOGB0  */
+	ret
+END (__ieee754_ilogbf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S
new file mode 100644
index 0000000000..60761dfa38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S
@@ -0,0 +1,43 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__ieee754_ilogbl)
+	fldt	4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+   required to return INT_MAX in ISO C99.
+   -- jakub@redhat.com.  */
+	fxam			/* Is NaN or +-Inf?  */
+	fstsw   %ax
+	movb    $0x45, %dh
+	andb    %ah, %dh
+	cmpb    $0x05, %dh
+	je      1f		/* Is +-Inf, jump.  */
+	cmpb    $0x40, %dh
+	je      2f		/* Is +-0, jump.  */
+
+	fxtract
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	fstp	%st
+
+	fistpl	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+
+	ret
+
+1:	fstp	%st
+	movl	$0x7fffffff, %eax
+	ret
+2:	fstp	%st
+	movl	$0x80000000, %eax	/* FP_ILOGB0  */
+	ret
+END (__ieee754_ilogbl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log.S b/REORG.TODO/sysdeps/i386/fpu/e_log.S
new file mode 100644
index 0000000000..335df22577
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log.S
@@ -0,0 +1,92 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_log)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+	fxam
+	fnstsw
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	sahf
+	jc	3f		// in case x is NaN or +-Inf
+4:	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	jp	4b		// in case x is +-Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2b
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	6f
+	fabs			// log(1) is +0 in all rounding modes.
+6:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__log_finite)
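
The limit = 0.29 split, shared by the log10, log2, logf and logl variants
below, picks between two hardware paths: for |x-1| small, fyl2xp1 computes
y * log2(1 + (x-1)) directly from x-1, avoiding the cancellation of forming
x and then taking its logarithm; everywhere else fyl2x is used.  The same
dispatch in C, with log1p playing the fyl2xp1 role (illustrative only):

#include <math.h>

static double
log_sketch (double x)
{
  double d = x - 1.0;
  if (fabs (d) <= 0.29)		/* fcompl MO(limit) */
    return log1p (d);		/* fyl2xp1 path: ln2 * log2(1 + d) */
  return log (x);		/* fyl2x path:   ln2 * log2(x) */
}
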
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10.S b/REORG.TODO/sysdeps/i386/fpu/e_log10.S
new file mode 100644
index 0000000000..17277084ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10.S
@@ -0,0 +1,68 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_log10)
+	fldlg2			// log10(2)
+	fldl	4(%esp)		// x : log10(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fxam
+	fnstsw
+	fld	%st		// x : x : log10(2)
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsubl	MO(one)		// x-1 : x : log10(2)
+	fld	%st		// x-1 : x-1 : x : log10(2)
+	fabs			// |x-1| : x-1 : x : log10(2)
+	fcompl	MO(limit)	// x-1 : x : log10(2)
+	fnstsw			// x-1 : x : log10(2)
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log10(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log10(2)
+	fyl2xp1			// log10(x)
+	ret
+
+2:	fstp	%st(0)		// x : log10(2)
+	fyl2x			// log10(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	ret
+END (__ieee754_log10)
+strong_alias (__ieee754_log10, __log10_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10f.S b/REORG.TODO/sysdeps/i386/fpu/e_log10f.S
new file mode 100644
index 0000000000..72a3b88251
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10f.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_log10f)
+	fldlg2			// log10(2)
+	flds	4(%esp)		// x : log10(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fxam
+	fnstsw
+	fld	%st		// x : x : log10(2)
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsubl	MO(one)		// x-1 : x : log10(2)
+	fld	%st		// x-1 : x-1 : x : log10(2)
+	fabs			// |x-1| : x-1 : x : log10(2)
+	fcompl	MO(limit)	// x-1 : x : log10(2)
+	fnstsw			// x-1 : x : log10(2)
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log10(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log10(2)
+	fyl2xp1			// log10(x)
+	ret
+
+2:	fstp	%st(0)		// x : log10(2)
+	fyl2x			// log10(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	ret
+END (__ieee754_log10f)
+strong_alias (__ieee754_log10f, __log10f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10l.S b/REORG.TODO/sysdeps/i386/fpu/e_log10l.S
new file mode 100644
index 0000000000..9326b19796
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10l.S
@@ -0,0 +1,71 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_log10l)
+	fldlg2			// log10(2)
+	fldt	4(%esp)		// x : log10(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fxam
+	fnstsw
+	fld	%st		// x : x : log10(2)
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsubl	MO(one)		// x-1 : x : log10(2)
+	fld	%st		// x-1 : x-1 : x : log10(2)
+	fabs			// |x-1| : x-1 : x : log10(2)
+	fcompl	MO(limit)	// x-1 : x : log10(2)
+	fnstsw			// x-1 : x : log10(2)
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log10(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log10(2)
+	fyl2xp1			// log10(x)
+	ret
+
+2:	fstp	%st(0)		// x : log10(2)
+	fyl2x			// log10(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	fadd	%st(0)
+	ret
+END(__ieee754_log10l)
+strong_alias (__ieee754_log10l, __log10l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2.S b/REORG.TODO/sysdeps/i386/fpu/e_log2.S
new file mode 100644
index 0000000000..73ff0fffd3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_log2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fldl	MO(one)
+	fldl	4(%esp)		// x : 1
+	fxam
+	fnstsw
+	fld	%st		// x : x : 1
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsub	%st(2), %st	// x-1 : x : 1
+	fld	%st		// x-1 : x-1 : x : 1
+	fabs			// |x-1| : x-1 : x : 1
+	fcompl	MO(limit)	// x-1 : x : 1
+	fnstsw			// x-1 : x : 1
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log2(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : 1
+	fyl2xp1			// log2(x)
+	ret
+
+2:	fstp	%st(0)		// x : 1
+	fyl2x			// log2(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	ret
+END (__ieee754_log2)
+strong_alias (__ieee754_log2, __log2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2f.S b/REORG.TODO/sysdeps/i386/fpu/e_log2f.S
new file mode 100644
index 0000000000..344eeb495e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2f.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_log2f)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fldl	MO(one)
+	flds	4(%esp)		// x : 1
+	fxam
+	fnstsw
+	fld	%st		// x : x : 1
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsub	%st(2), %st	// x-1 : x : 1
+	fld	%st		// x-1 : x-1 : x : 1
+	fabs			// |x-1| : x-1 : x : 1
+	fcompl	MO(limit)	// x-1 : x : 1
+	fnstsw			// x-1 : x : 1
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log2(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : 1
+	fyl2xp1			// log2(x)
+	ret
+
+2:	fstp	%st(0)		// x : 1
+	fyl2x			// log2(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	ret
+END (__ieee754_log2f)
+strong_alias (__ieee754_log2f, __log2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2l.S b/REORG.TODO/sysdeps/i386/fpu/e_log2l.S
new file mode 100644
index 0000000000..73e62ea908
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2l.S
@@ -0,0 +1,70 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_log2l)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fldl	MO(one)
+	fldt	4(%esp)		// x : 1
+	fxam
+	fnstsw
+	fld	%st		// x : x : 1
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsub	%st(2), %st	// x-1 : x : 1
+	fld	%st		// x-1 : x-1 : x : 1
+	fabs			// |x-1| : x-1 : x : 1
+	fcompl	MO(limit)	// x-1 : x : 1
+	fnstsw			// x-1 : x : 1
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log2(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : 1
+	fyl2xp1			// log2(x)
+	ret
+
+2:	fstp	%st(0)		// x : 1
+	fyl2x			// log2(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	fadd	%st(0)
+	ret
+END (__ieee754_log2l)
+strong_alias (__ieee754_log2l, __log2l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/fpu/e_logf.S
new file mode 100644
index 0000000000..de967a31f5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_logf.S
@@ -0,0 +1,93 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_logf)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+	fxam
+	fnstsw
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	sahf
+	jc	3f		// in case x is NaN or +-Inf
+4:	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	jp	4b		// in case x is +-Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2b
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	6f
+	fabs			// log(1) is +0 in all rounding modes.
+6:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/fpu/e_logl.S
new file mode 100644
index 0000000000..53127d704e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_logl.S
@@ -0,0 +1,97 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_logl)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+	fxam
+	fnstsw
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	sahf
+	jc	3f		// in case x is NaN or +-Inf
+	movzwl	4+8(%esp), %eax
+	cmpl	$0xc000, %eax
+	jae	6f		// x <= -2, avoid overflow from -LDBL_MAX - 1.
+4:	fsubl	MO(one)		// x-1 : x : log(2)
+6:	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	5f
+	fabs			// log(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	jp	4b		// in case x is +-Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	fadd	%st(0)
+	ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2b
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	7f
+	fabs			// log(1) is +0 in all rounding modes.
+7:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logl_finite)
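
__ieee754_logl carries one guard the narrower types do not need: when
x <= -2 it jumps over the fsubl, because x - 1 evaluated at x = -LDBL_MAX
overflows to -Inf under FE_DOWNWARD and would raise a spurious overflow before
fyl2x gets to return NaN for the negative argument.  Since |x| >= 2 > 0.29,
the fyl2x path is taken regardless and pops the substituted slot unused.  An
illustrative test of the overflow being avoided (not glibc code):

#include <fenv.h>
#include <float.h>
#include <math.h>

static int
demo (void)
{
  fesetround (FE_DOWNWARD);
  volatile long double t = -LDBL_MAX - 1.0L;	/* rounds down: overflow */
  fesetround (FE_TONEAREST);
  return isinf (t);				/* 1: t became -Inf */
}
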
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_pow.S b/REORG.TODO/sysdeps/i386/fpu/e_pow.S
new file mode 100644
index 0000000000..2edb9a9fbc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_pow.S
@@ -0,0 +1,456 @@
+/* ix87 specific implementation of pow function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	.type p63,@object
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
+	.type p10,@object
+p10:	.byte 0, 0, 0, 0, 0, 0, 0x90, 0x40
+	ASM_SIZE_DIRECTIVE(p10)
+
+	.section .rodata.cst16,"aM",@progbits,16
+
+	.p2align 3
+	.type infinity,@object
+inf_zero:
+infinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
+	ASM_SIZE_DIRECTIVE(infinity)
+	.type zero,@object
+zero:	.double 0.0
+	ASM_SIZE_DIRECTIVE(zero)
+	.type minf_mzero,@object
+minf_mzero:
+minfinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
+mzero:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+	.text
+ENTRY(__ieee754_pow)
+	fldl	12(%esp)	// y
+	fxam
+
+#ifdef	PIC
+	LOAD_PIC_REG (cx)
+#endif
+
+	fnstsw
+	movb	%ah, %dl
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// is y == 0 ?
+	je	11f
+
+	cmpb	$0x05, %ah	// is y == ±inf ?
+	je	12f
+
+	cmpb	$0x01, %ah	// is y == NaN ?
+	je	30f
+
+	fldl	4(%esp)		// x : y
+
+	subl	$8,%esp
+	cfi_adjust_cfa_offset (8)
+
+	fxam
+	fnstsw
+	movb	%ah, %dh
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	je	20f		// x is ±0
+
+	cmpb	$0x05, %ah
+	je	15f		// x is ±inf
+
+	cmpb	$0x01, %ah
+	je	32f		// x is NaN
+
+	fxch			// y : x
+
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
+	/* First see whether `y' is a natural number.  In this case we
+	   can use a more precise algorithm.  */
+	fld	%st		// y : y : x
+	fistpll	(%esp)		// y : x
+	fildll	(%esp)		// int(y) : y : x
+	fucomp	%st(1)		// y : x
+	fnstsw
+	sahf
+	jne	3f
+
+	/* OK, we have an integer value for y.  If large enough that
+	   errors may propagate out of the 11 bits excess precision, use
+	   the algorithm for real exponent instead.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p10)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	orl	$0, %edx
+	fstp	%st(0)		// x
+	jns	4f		// y >= 0, jump
+	fdivrl	MO(one)		// 1/x		(now referred to as x)
+	negl	%eax
+	adcl	$0, %edx
+	negl	%edx
+4:	fldl	MO(one)		// 1 : x
+	fxch
+
+	/* If y is even, take the absolute value of x.  Otherwise,
+	   ensure all intermediate values that might overflow have the
+	   sign of x.  */
+	testb	$1, %al
+	jnz	6f
+	fabs
+
+6:	shrdl	$1, %edx, %eax
+	jnc	5f
+	fxch
+	fabs
+	fmul	%st(1)		// x : ST*x
+	fxch
+5:	fld	%st		// x : x : ST*x
+	fabs			// |x| : x : ST*x
+	fmulp			// |x|*x : ST*x
+	shrl	$1, %edx
+	movl	%eax, %ecx
+	orl	%edx, %ecx
+	jnz	6b
+	fstp	%st(0)		// ST*x
+#ifdef	PIC
+	LOAD_PIC_REG (cx)
+#endif
+	DBL_NARROW_EVAL_UFLOW_NONNAN
+	ret
+
+	/* y is ±NAN */
+30:	fldl	4(%esp)		// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fucomp	%st(1)		// x : y
+	fnstsw
+	sahf
+	je	31f
+	fxch			// y : x
+31:	fstp	%st(1)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+32:	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fstp	%st(1)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+2:	// y is a large integer (absolute value at least 1L<<10), but
+	// may be odd unless at least 1L<<64.  So it may be necessary
+	// to adjust the sign of a negative result afterwards.
+	fxch			// x : y
+	fabs			// |x| : y
+	fxch			// y : x
+	.align ALIGNARG(4)
+3:	/* y is a real number.  */
+	fxch			// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fldl	MO(limit)	// 0.29 : 1.0 : x : y
+	fld	%st(2)		// x : 0.29 : 1.0 : x : y
+	fsub	%st(2)		// x-1 : 0.29 : 1.0 : x : y
+	fabs			// |x-1| : 0.29 : 1.0 : x : y
+	fucompp			// 1.0 : x : y
+	fnstsw
+	fxch			// x : 1.0 : y
+	sahf
+	ja	7f
+	fsub	%st(1)		// x-1 : 1.0 : y
+	fyl2xp1			// log2(x) : y
+	jmp	8f
+
+7:	fyl2x			// log2(x) : y
+8:	fmul	%st(1)		// y*log2(x) : y
+	fst	%st(1)		// y*log2(x) : y*log2(x)
+	frndint			// int(y*log2(x)) : y*log2(x)
+	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
+	fxch			// fract(y*log2(x)) : int(y*log2(x))
+	f2xm1			// 2^fract(y*log2(x))-1 : int(y*log2(x))
+	faddl	MO(one)		// 2^fract(y*log2(x)) : int(y*log2(x))
+
+	// Before scaling, we must negate if x is negative and y is an
+	// odd integer.
+	testb	$2, %dh
+	jz	291f
+	// x is negative.  If y is an odd integer, negate the result.
+	fldl	20(%esp)	// y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fld	%st		// y : y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fabs			// |y| : y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fcompl	MO(p63)		// y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fnstsw
+	sahf
+	jnc	290f
+
+	// We must find out whether y is an odd integer.
+	fld	%st		// y : y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fistpll	(%esp)		// y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fildll	(%esp)		// int(y) : y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fucompp			// 2^fract(y*log2(x)) : int(y*log2(x))
+	fnstsw
+	sahf
+	jne	291f
+
+	// OK, the value is an integer, but is it odd?
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	292f		// jump if not odd
+	// It's an odd integer.
+	fchs
+	jmp	292f
+
+	cfi_adjust_cfa_offset (8)
+290:	fstp	%st(0)		// 2^fract(y*log2(x)) : int(y*log2(x))
+291:	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+292:	fscale			// +/- 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
+	fstp	%st(1)		// +/- 2^fract(y*log2(x))*2^int(y*log2(x))
+	DBL_NARROW_EVAL_UFLOW_NONNAN
+	ret
+
+
+	// pow(x,±0) = 1
+	.align ALIGNARG(4)
+11:	fstp	%st(0)		// pop y
+	fldl	MO(one)
+	ret
+
+	// y == ±inf
+	.align ALIGNARG(4)
+12:	fstp	%st(0)		// pop y
+	fldl	MO(one)		// 1
+	fldl	4(%esp)		// x : 1
+	fabs			// abs(x) : 1
+	fucompp			// < 1, == 1, or > 1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x45, %ah
+	je	13f		// jump if x is NaN
+
+	cmpb	$0x40, %ah
+	je	14f		// jump if |x| == 1
+
+	shlb	$1, %ah
+	xorb	%ah, %dl
+	andl	$2, %edx
+	fldl	MOX(inf_zero, %edx, 4)
+	ret
+
+	.align ALIGNARG(4)
+14:	fldl	MO(one)
+	ret
+
+	.align ALIGNARG(4)
+13:	fldl	4(%esp)		// load x == NaN
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±inf
+15:	fstp	%st(0)		// y
+	testb	$2, %dh
+	jz	16f		// jump if x == +inf
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p63)		// y
+	fnstsw
+	sahf
+	jnc	16f
+
+	// We must find out whether y is an odd integer.
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	17f
+
+	// OK, the value is an integer.
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	18f		// jump if not odd
+	// It's an odd integer.
+	shrl	$31, %edx
+	fldl	MOX(minf_mzero, %edx, 8)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+16:	fcompl	MO(zero)
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fnstsw
+	shrl	$5, %eax
+	andl	$8, %eax
+	fldl	MOX(inf_zero, %eax, 1)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+17:	shll	$30, %edx	// sign bit for y in right position
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+18:	shrl	$31, %edx
+	fldl	MOX(inf_zero, %edx, 8)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±0
+20:	fstp	%st(0)		// y
+	testb	$2, %dl
+	jz	21f		// y > 0
+
+	// x is ±0 and y is < 0.  We must find out whether y is an odd integer.
+	testb	$2, %dh
+	jz	25f
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p63)		// y
+	fnstsw
+	sahf
+	jnc	25f
+
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	26f
+
+	// OK, the value is an integer.
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	27f		// jump if not odd
+	// It's an odd integer.
+	// Raise divide-by-zero exception and get minus infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	fchs
+	ret
+
+	cfi_adjust_cfa_offset (8)
+25:	fstp	%st(0)
+26:	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+27:	// Raise divide-by-zero exception and get infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±0 and y is > 0.  We must find out whether y is an odd integer.
+21:	testb	$2, %dh
+	jz	22f
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fcoml	MO(p63)		// y
+	fnstsw
+	sahf
+	jnc	22f
+
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	23f
+
+	// OK, the value is an integer.
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	24f		// jump if not odd
+	// It's an odd integer.
+	fldl	MO(mzero)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+22:	fstp	%st(0)
+23:	addl	$8, %esp	// Don't use 2 x pop
+	cfi_adjust_cfa_offset (-8)
+24:	fldl	MO(zero)
+	ret
+
+END(__ieee754_pow)
+strong_alias (__ieee754_pow, __pow_finite)
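
The integer-y fast path above is binary exponentiation: the shrdl $1, %edx,
%eax loop walks the 64-bit integer |y| held in %edx:%eax (negative y has
already been folded into 1/x by the fdivrl) from the low bit up, squaring x at
each step and multiplying the accumulator when the shifted-out bit is set; the
fabs juggling only controls the sign of intermediates.  The real-exponent path
is the exp2-style reduction again, applied to y*log2(x).  A sketch of the bit
loop:

static double
powi_sketch (double x, unsigned long long n)
{
  double acc = 1.0;		/* fldl MO(one) */
  while (n != 0)
    {
      if (n & 1)		/* shrdl moves the bit into CF */
        acc *= x;		/* fmul %st(1) */
      x *= x;			/* squared for the next bit */
      n >>= 1;
    }
  return acc;
}
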
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_powf.S b/REORG.TODO/sysdeps/i386/fpu/e_powf.S
new file mode 100644
index 0000000000..467ef2380b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_powf.S
@@ -0,0 +1,392 @@
+/* ix87 specific implementation of pow function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	.type p31,@object
+p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
+	ASM_SIZE_DIRECTIVE(p31)
+
+	.section .rodata.cst16,"aM",@progbits,16
+
+	.p2align 3
+	.type infinity,@object
+inf_zero:
+infinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
+	ASM_SIZE_DIRECTIVE(infinity)
+	.type zero,@object
+zero:	.double 0.0
+	ASM_SIZE_DIRECTIVE(zero)
+	.type minf_mzero,@object
+minf_mzero:
+minfinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
+mzero:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+	.text
+ENTRY(__ieee754_powf)
+	flds	8(%esp)	// y
+	fxam
+
+#ifdef	PIC
+	LOAD_PIC_REG (cx)
+#endif
+
+	fnstsw
+	movb	%ah, %dl
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// is y == 0 ?
+	je	11f
+
+	cmpb	$0x05, %ah	// is y == ±inf ?
+	je	12f
+
+	cmpb	$0x01, %ah	// is y == NaN ?
+	je	30f
+
+	flds	4(%esp)		// x : y
+
+	subl	$4, %esp
+	cfi_adjust_cfa_offset (4)
+
+	fxam
+	fnstsw
+	movb	%ah, %dh
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	je	20f		// x is ±0
+
+	cmpb	$0x05, %ah
+	je	15f		// x is ±inf
+
+	cmpb	$0x01, %ah
+	je	33f		// x is NaN
+
+	fxch			// y : x
+
+	/* fistpl raises invalid exception for |y| >= 1L<<31.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p31)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
+	/* First see whether `y' is a natural number.  In this case we
+	   can use a more precise algorithm.  */
+	fld	%st		// y : y : x
+	fistpl	(%esp)		// y : x
+	fildl	(%esp)		// int(y) : y : x
+	fucomp	%st(1)		// y : x
+	fnstsw
+	sahf
+	jne	3f
+
+	/* OK, we have an integer value for y.  */
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	orl	$0, %edx
+	fstp	%st(0)		// x
+	jns	4f		// y >= 0, jump
+	fdivrl	MO(one)		// 1/x		(now referred to as x)
+	negl	%edx
+4:	fldl	MO(one)		// 1 : x
+	fxch
+
+	/* If y is even, take the absolute value of x.  Otherwise,
+	   ensure all intermediate values that might overflow have the
+	   sign of x.  */
+	testb	$1, %dl
+	jnz	6f
+	fabs
+
+6:	shrl	$1, %edx
+	jnc	5f
+	fxch
+	fabs
+	fmul	%st(1)		// x : ST*x
+	fxch
+5:	fld	%st		// x : x : ST*x
+	fabs			// |x| : x : ST*x
+	fmulp			// |x|*x : ST*x
+	testl	%edx, %edx
+	jnz	6b
+	fstp	%st(0)		// ST*x
+	FLT_NARROW_EVAL_UFLOW_NONNAN
+	ret
+
+	/* y is ±NAN */
+30:	flds	4(%esp)		// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fucomp	%st(1)		// x : y
+	fnstsw
+	sahf
+	je	31f
+	fxch			// y : x
+31:	fstp	%st(1)
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+2:	/* y is a large integer (so even).  */
+	fxch			// x : y
+	fabs			// |x| : y
+	fxch			// y : x
+	.align ALIGNARG(4)
+3:	/* y is a real number.  */
+	fxch			// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fldl	MO(limit)	// 0.29 : 1.0 : x : y
+	fld	%st(2)		// x : 0.29 : 1.0 : x : y
+	fsub	%st(2)		// x-1 : 0.29 : 1.0 : x : y
+	fabs			// |x-1| : 0.29 : 1.0 : x : y
+	fucompp			// 1.0 : x : y
+	fnstsw
+	fxch			// x : 1.0 : y
+	sahf
+	ja	7f
+	fsub	%st(1)		// x-1 : 1.0 : y
+	fyl2xp1			// log2(x) : y
+	jmp	8f
+
+7:	fyl2x			// log2(x) : y
+8:	fmul	%st(1)		// y*log2(x) : y
+	fst	%st(1)		// y*log2(x) : y*log2(x)
+	frndint			// int(y*log2(x)) : y*log2(x)
+	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
+	fxch			// fract(y*log2(x)) : int(y*log2(x))
+	f2xm1			// 2^fract(y*log2(x))-1 : int(y*log2(x))
+	faddl	MO(one)		// 2^fract(y*log2(x)) : int(y*log2(x))
+	fscale			// 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
+32:	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
+	FLT_NARROW_EVAL_UFLOW_NONNAN
+	ret
+
+	/* x is NaN.  */
+	cfi_adjust_cfa_offset (4)
+33:	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+	fstp	%st(1)
+	ret
+
+	// pow(x,±0) = 1
+	.align ALIGNARG(4)
+11:	fstp	%st(0)		// pop y
+	fldl	MO(one)
+	ret
+
+	// y == ±inf
+	.align ALIGNARG(4)
+12:	fstp	%st(0)		// pop y
+	fldl	MO(one)		// 1
+	flds	4(%esp)		// x : 1
+	fabs			// abs(x) : 1
+	fucompp			// < 1, == 1, or > 1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x45, %ah
+	je	13f		// jump if x is NaN
+
+	cmpb	$0x40, %ah
+	je	14f		// jump if |x| == 1
+
+	shlb	$1, %ah
+	xorb	%ah, %dl
+	andl	$2, %edx
+	fldl	MOX(inf_zero, %edx, 4)
+	ret
+
+	.align ALIGNARG(4)
+14:	fldl	MO(one)
+	ret
+
+	.align ALIGNARG(4)
+13:	flds	4(%esp)		// load x == NaN
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+	// x is ±inf
+15:	fstp	%st(0)		// y
+	testb	$2, %dh
+	jz	16f		// jump if x == +inf
+
+	// fistpl raises invalid exception for |y| >= 1L<<31, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p31)		// y
+	fnstsw
+	sahf
+	jnc	16f
+
+	// We must find out whether y is an odd integer.
+	fld	%st		// y : y
+	fistpl	(%esp)		// y
+	fildl	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	17f
+
+	// OK, the value is an integer.
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	testb	$1, %dl
+	jz	18f		// jump if not odd
+	// It's an odd integer.
+	shrl	$31, %edx
+	fldl	MOX(minf_mzero, %edx, 8)
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+16:	fcompl	MO(zero)
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+	fnstsw
+	shrl	$5, %eax
+	andl	$8, %eax
+	fldl	MOX(inf_zero, %eax, 1)
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+17:	shll	$30, %edx	// sign bit for y in right position
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+18:	shrl	$31, %edx
+	fldl	MOX(inf_zero, %edx, 8)
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+	// x is ±0
+20:	fstp	%st(0)		// y
+	testb	$2, %dl
+	jz	21f		// y > 0
+
+	// x is ±0 and y is < 0.  We must find out whether y is an odd integer.
+	testb	$2, %dh
+	jz	25f
+
+	// fistpl raises invalid exception for |y| >= 1L<<31, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p31)		// y
+	fnstsw
+	sahf
+	jnc	25f
+
+	fld	%st		// y : y
+	fistpl	(%esp)		// y
+	fildl	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	26f
+
+	// OK, the value is an integer.
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	testb	$1, %dl
+	jz	27f		// jump if not odd
+	// It's an odd integer.
+	// Raise divide-by-zero exception and get minus infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	fchs
+	ret
+
+	cfi_adjust_cfa_offset (4)
+25:	fstp	%st(0)
+26:	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+27:	// Raise divide-by-zero exception and get infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+	// x is ±0 and y is > 0.  We must find out whether y is an odd integer.
+21:	testb	$2, %dh
+	jz	22f
+
+	// fistpl raises invalid exception for |y| >= 1L<<31, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fcoml	MO(p31)		// y
+	fnstsw
+	sahf
+	jnc	22f
+
+	fld	%st		// y : y
+	fistpl	(%esp)		// y
+	fildl	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	23f
+
+	// OK, the value is an integer.
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	testb	$1, %dl
+	jz	24f		// jump if not odd
+	// It's an odd integer.
+	fldl	MO(mzero)
+	ret
+
+	cfi_adjust_cfa_offset (4)
+22:	fstp	%st(0)
+23:	addl	$4, %esp	// Don't use pop.
+	cfi_adjust_cfa_offset (-4)
+24:	fldl	MO(zero)
+	ret
+
+END(__ieee754_powf)
+strong_alias (__ieee754_powf, __powf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_powl.S b/REORG.TODO/sysdeps/i386/fpu/e_powl.S
new file mode 100644
index 0000000000..9e162848e4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_powl.S
@@ -0,0 +1,459 @@
+/* ix87 specific implementation of pow function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type p2,@object
+p2:	.byte 0, 0, 0, 0, 0, 0, 0x10, 0x40
+	ASM_SIZE_DIRECTIVE(p2)
+	.type p63,@object
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
+	.type p64,@object
+p64:	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
+	ASM_SIZE_DIRECTIVE(p64)
+	.type p78,@object
+p78:	.byte 0, 0, 0, 0, 0, 0, 0xd0, 0x44
+	ASM_SIZE_DIRECTIVE(p78)
+	.type pm79,@object
+pm79:	.byte 0, 0, 0, 0, 0, 0, 0, 0x3b
+	ASM_SIZE_DIRECTIVE(pm79)
+
+	.section .rodata.cst16,"aM",@progbits,16
+
+	.p2align 3
+	.type infinity,@object
+inf_zero:
+infinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
+	ASM_SIZE_DIRECTIVE(infinity)
+	.type zero,@object
+zero:	.double 0.0
+	ASM_SIZE_DIRECTIVE(zero)
+	.type minf_mzero,@object
+minf_mzero:
+minfinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
+mzero:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+	.text
+ENTRY(__ieee754_powl)
+	fldt	16(%esp)	// y
+	fxam
+
+#ifdef	PIC
+	LOAD_PIC_REG (cx)
+#endif
+
+	fnstsw
+	movb	%ah, %dl
+	andb	$0x45, %ah
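+	// fnstsw copies the x87 condition codes into %ah: C0 -> bit 0,
+	// C1 (sign) -> bit 1, C2 -> bit 2, C3 -> bit 6.  Masked with
+	// 0x45 (C3|C2|C0), fxam yields 0x40 for zero, 0x05 for
+	// infinity, 0x01 for NaN and 0x04 for a normal finite value.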
+	cmpb	$0x40, %ah	// is y == 0 ?
+	je	11f
+
+	cmpb	$0x05, %ah	// is y == ±inf ?
+	je	12f
+
+	cmpb	$0x01, %ah	// is y == NaN ?
+	je	30f
+
+	fldt	4(%esp)		// x : y
+
+	subl	$8,%esp
+	cfi_adjust_cfa_offset (8)
+
+	fxam
+	fnstsw
+	movb	%ah, %dh
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	je	20f		// x is ±0
+
+	cmpb	$0x05, %ah
+	je	15f		// x is ±inf
+
+	cmpb	$0x01, %ah
+	je	32f		// x is NaN
+
+	fxch			// y : x
+
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
+	/* First see whether `y' is a natural number.  In this case we
+	   can use a more precise algorithm.  */
+	fld	%st		// y : y : x
+	fistpll	(%esp)		// y : x
+	fildll	(%esp)		// int(y) : y : x
+	fucomp	%st(1)		// y : x
+	fnstsw
+	sahf
+	je	9f
+
+	// If y has absolute value at most 0x1p-79, then any finite
+	// nonzero x will result in 1.  Saturate y to those bounds to
+	// avoid underflow in the calculation of y*log2(x).
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(pm79)	// y : x
+	fnstsw
+	sahf
+	jnc	3f
+	fstp	%st(0)		// pop y
+	fldl	MO(pm79)	// 0x1p-79 : x
+	testb	$2, %dl
+	jz	3f		// y > 0
+	fchs			// -0x1p-79 : x
+	jmp	3f
+
+9:	/* OK, we have an integer value for y.  Unless very small
+	   (we use < 4), use the algorithm for real exponent to avoid
+	   accumulation of errors.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p2)		// y : x
+	fnstsw
+	sahf
+	jnc	3f
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	orl	$0, %edx
+	fstp	%st(0)		// x
+	jns	4f		// y >= 0, jump
+	fdivrl	MO(one)		// 1/x		(now referred to as x)
+	negl	%eax
+	adcl	$0, %edx
+	negl	%edx
+4:	fldl	MO(one)		// 1 : x
+	fxch
+
+	/* If y is even, take the absolute value of x.  Otherwise,
+	   ensure all intermediate values that might overflow have the
+	   sign of x.  */
+	testb	$1, %al
+	jnz	6f
+	fabs
+
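+	// Binary exponentiation: %edx:%eax holds |y|, st(0) the running
+	// power of x and st(1) the accumulated result.  Each iteration
+	// shifts the low exponent bit out of %eax and squares st(0).
+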
+6:	shrdl	$1, %edx, %eax
+	jnc	5f
+	fxch
+	fabs
+	fmul	%st(1)		// x : ST*x
+	fxch
+5:	fld	%st		// x : x : ST*x
+	fabs			// |x| : x : ST*x
+	fmulp			// |x|*x : ST*x
+	shrl	$1, %edx
+	movl	%eax, %ecx
+	orl	%edx, %ecx
+	jnz	6b
+	fstp	%st(0)		// ST*x
+#ifdef	PIC
+	LOAD_PIC_REG (cx)
+#endif
+	LDBL_CHECK_FORCE_UFLOW_NONNAN
+	ret
+
+	/* y is ±NaN */
+30:	fldt	4(%esp)		// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fucomp	%st(1)		// x : y
+	fnstsw
+	sahf
+	je	33f
+31:	/* At least one argument NaN, and result should be NaN.  */
+	faddp
+	ret
+33:	jp	31b
+	/* pow (1, NaN); check whether the NaN is signaling.  */
+	testb	$0x40, 23(%esp)
+	jz	31b
+	fstp	%st(1)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+32:	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	faddp
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+2:	// y is a large integer (absolute value at least 1L<<63).
+	// If y has absolute value at least 1L<<78, then any finite
+	// nonzero x will result in 0 (underflow), 1 or infinity (overflow).
+	// Saturate y to those bounds to avoid overflow in the calculation
+	// of y*log2(x).
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p78)		// y : x
+	fnstsw
+	sahf
+	jc	3f
+	fstp	%st(0)		// pop y
+	fldl	MO(p78)		// 1L<<78 : x
+	testb	$2, %dl
+	jz	3f		// y > 0
+	fchs			// -(1L<<78) : x
+	.align ALIGNARG(4)
+3:	/* y is a real number.  */
+	subl	$28, %esp
+	cfi_adjust_cfa_offset (28)
+	fstpt	12(%esp)	// x
+	fstpt	(%esp)		// <empty>
+	call	HIDDEN_JUMPTARGET (__powl_helper)	// <result>
+	addl	$36, %esp
+	cfi_adjust_cfa_offset (-36)
+	ret
+
+	// pow(x,±0) = 1, unless x is sNaN
+	.align ALIGNARG(4)
+11:	fstp	%st(0)		// pop y
+	fldt	4(%esp)		// x
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	112f		// x is NaN
+111:	fstp	%st(0)
+	fldl	MO(one)
+	ret
+
+112:	testb	$0x40, 11(%esp)
+	jnz	111b
+	fadd	%st(0)
+	ret
+
+	// y == ±inf
+	.align ALIGNARG(4)
+12:	fstp	%st(0)		// pop y
+	fldl	MO(one)		// 1
+	fldt	4(%esp)		// x : 1
+	fabs			// abs(x) : 1
+	fucompp			// < 1, == 1, or > 1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x45, %ah
+	je	13f		// jump if x is NaN
+
+	cmpb	$0x40, %ah
+	je	14f		// jump if |x| == 1
+
+	shlb	$1, %ah
+	xorb	%ah, %dl
+	andl	$2, %edx
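+	// %edx bit 1 = (|x| < 1) XOR (y < 0): 0 selects infinity
+	// (offset 0) and 2 selects zero (offset 8) in inf_zero.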
+	fldl	MOX(inf_zero, %edx, 4)
+	ret
+
+	.align ALIGNARG(4)
+14:	fldl	MO(one)
+	ret
+
+	.align ALIGNARG(4)
+13:	fldt	4(%esp)		// load x == NaN
+	fadd	%st(0)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±inf
+15:	fstp	%st(0)		// y
+	testb	$2, %dh
+	jz	16f		// jump if x == +inf
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, but y
+	// may be odd unless we know |y| >= 1L<<64.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p64)		// y
+	fnstsw
+	sahf
+	jnc	16f
+	fldl	MO(p63)		// p63 : y
+	fxch			// y : p63
+	fprem			// y%p63 : p63
+	fstp	%st(1)		// y%p63
+
+	// We must find out whether y is an odd integer.
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	17f
+
+	// OK, the value is an integer, but is it odd?
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	18f		// jump if not odd
+	// It's an odd integer.
+	shrl	$31, %edx
+	fldl	MOX(minf_mzero, %edx, 8)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+16:	fcompl	MO(zero)
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fnstsw
+	shrl	$5, %eax
+	andl	$8, %eax
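+	// fcom's C0 (set when y < 0) moves from bit 8 of the status
+	// word to bit 3, giving offset 8 (zero) for negative y and
+	// offset 0 (infinity) otherwise.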
+	fldl	MOX(inf_zero, %eax, 1)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+17:	shll	$30, %edx	// sign bit for y in right position
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+18:	shrl	$31, %edx
+	fldl	MOX(inf_zero, %edx, 8)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±0
+20:	fstp	%st(0)		// y
+	testb	$2, %dl
+	jz	21f		// y > 0
+
+	// x is ±0 and y is < 0.  We must find out whether y is an odd integer.
+	testb	$2, %dh
+	jz	25f
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, but y
+	// may be odd unless we know |y| >= 1L<<64.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p64)		// y
+	fnstsw
+	sahf
+	jnc	25f
+	fldl	MO(p63)		// p63 : y
+	fxch			// y : p63
+	fprem			// y%p63 : p63
+	fstp	%st(1)		// y%p63
+
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	26f
+
+	// OK, the value is an integer, but is it odd?
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	27f		// jump if not odd
+	// It's an odd integer.
+	// Raise divide-by-zero exception and get minus infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	fchs
+	ret
+
+	cfi_adjust_cfa_offset (8)
+25:	fstp	%st(0)
+26:	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+27:	// Raise divide-by-zero exception and get infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±0 and y is > 0.  We must find out whether y is an odd integer.
+21:	testb	$2, %dh
+	jz	22f
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, but y
+	// may be odd unless we know |y| >= 1L<<64.
+	fld	%st		// y : y
+	fcompl	MO(p64)		// y
+	fnstsw
+	sahf
+	jnc	22f
+	fldl	MO(p63)		// p63 : y
+	fxch			// y : p63
+	fprem			// y%p63 : p63
+	fstp	%st(1)		// y%p63
+
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	23f
+
+	// OK, the value is an integer, but is it odd?
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	24f		// jump if not odd
+	// It's an odd integer.
+	fldl	MO(mzero)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+22:	fstp	%st(0)
+23:	addl	$8, %esp	// Don't use 2 x pop
+	cfi_adjust_cfa_offset (-8)
+24:	fldl	MO(zero)
+	ret
+
+END(__ieee754_powl)
+strong_alias (__ieee754_powl, __powl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c b/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c
new file mode 100644
index 0000000000..1347b0468c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c
@@ -0,0 +1,3 @@
+/* Empty.  This file is only meant to avoid compiling the file with the
+   same name in the libm-ieee754 directory.  The code is not used since
+   there is an assembler version for all users of this file.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainder.S b/REORG.TODO/sysdeps/i386/fpu/e_remainder.S
new file mode 100644
index 0000000000..f7867aa90b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainder.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainder)
+	fldl	12(%esp)
+	fldl	4(%esp)
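+	// fprem1 sets C2 while the reduction is only partial; sahf
+	// maps C2 to PF, so loop until the remainder is complete.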
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b
+	fstp	%st(1)
+	ret
+END (__ieee754_remainder)
+strong_alias (__ieee754_remainder, __remainder_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S b/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S
new file mode 100644
index 0000000000..cfd390bc69
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainderf)
+	flds	8(%esp)
+	flds	4(%esp)
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b
+	fstp	%st(1)
+	ret
+END (__ieee754_remainderf)
+strong_alias (__ieee754_remainderf, __remainderf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S b/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S
new file mode 100644
index 0000000000..5ec23a37a3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainderl)
+	fldt	16(%esp)
+	fldt	4(%esp)
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b
+	fstp	%st(1)
+	ret
+END (__ieee754_remainderl)
+strong_alias (__ieee754_remainderl, __remainderl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalb.S b/REORG.TODO/sysdeps/i386/fpu/e_scalb.S
new file mode 100644
index 0000000000..370924c29f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalb.S
@@ -0,0 +1,100 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type zero_nan,@object
+zero_nan:
+	.double 0.0
+nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+	ASM_SIZE_DIRECTIVE(zero_nan)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+	.text
+ENTRY(__ieee754_scalb)
+	fldl	12(%esp)
+	fxam
+	fnstsw
+	fldl	4(%esp)
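+	// Tested in %eax: 0x4700 keeps fxam's C3/C2/C1/C0.  0x0700
+	// (C2, C1 and C0 set, C3 clear) identifies a negative
+	// infinity; 0x0100 under the 0x4500 mask identifies a NaN.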
+	andl	$0x4700, %eax
+	cmpl	$0x0700, %eax
+	je	1f
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax
+	je	2f
+	fxam
+	fnstsw
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax
+	je	3f
+	fld	%st(1)
+	frndint
+	fcomp	%st(2)
+	fnstsw
+	sahf
+	jne	4f
+	fscale
+	fstp	%st(1)
+	DBL_NARROW_EVAL
+	ret
+
+	/* y is -inf */
+1:	fxam
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fnstsw
+	movl	8(%esp), %edx
+	shrl	$5, %eax
+	fstp	%st
+	fstp	%st
+	andl	$0x80000000, %edx
+	andl	$0x0228, %eax
+	cmpl	$0x0028, %eax
+	je	4f
+	andl	$8, %eax
+	shrl	$27, %edx
+	addl	%edx, %eax
+	fldl	MOX(zero_nan, %eax, 1)
+	ret
+
+	/* The result is NaN, but we must not raise an exception,
+	   so load the NaN from a variable instead of computing it.  */
+2:	fstp	%st
+	fstp	%st
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	MO(nan)
+	ret
+
+	/* The first parameter is a NaN.  Return it.  */
+3:	fstp	%st(1)
+	ret
+
+	/* Return NaN and raise the invalid exception.  */
+4:	fstp	%st
+	fstp	%st
+	fldz
+	fdiv	%st
+	ret
+END(__ieee754_scalb)
+strong_alias (__ieee754_scalb, __scalb_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S b/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S
new file mode 100644
index 0000000000..4f2dfa3acf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S
@@ -0,0 +1,102 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type zero_nan,@object
+zero_nan:
+	.double 0.0
+nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+	ASM_SIZE_DIRECTIVE(zero_nan)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+
+	.text
+ENTRY(__ieee754_scalbf)
+	flds	8(%esp)
+	fxam
+	fnstsw
+	flds	4(%esp)
+	andl	$0x4700, %eax
+	cmpl	$0x0700, %eax
+	je	1f
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax
+	je	2f
+	fxam
+	fnstsw
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax
+	je	3f
+	fld	%st(1)
+	frndint
+	fcomp	%st(2)
+	fnstsw
+	sahf
+	jne	4f
+	fscale
+	fstp	%st(1)
+	FLT_NARROW_EVAL
+	ret
+
+	/* y is -inf */
+1:	fxam
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fnstsw
+	movl	4(%esp), %edx
+	shrl	$5, %eax
+	fstp	%st
+	fstp	%st
+	andl	$0x80000000, %edx
+	andl	$0x0228, %eax
+	cmpl	$0x0028, %eax
+	je	4f
+	andl	$8, %eax
+	shrl	$27, %edx
+	addl	%edx, %eax
+	fldl	MOX(zero_nan, %eax, 1)
+	ret
+
+	/* The result is NaN, but we must not raise an exception,
+	   so load the NaN from a variable instead of computing it.  */
+2:	fstp	%st
+	fstp	%st
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	MO(nan)
+	ret
+
+	/* The first parameter is a NaN.  Return it.  */
+3:	fstp	%st(1)
+	ret
+
+	/* Return NaN and raise the invalid exception.  */
+4:	fstp	%st
+	fstp	%st
+	fldz
+	fdiv	%st
+	ret
+END(__ieee754_scalbf)
+strong_alias (__ieee754_scalbf, __scalbf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S b/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S
new file mode 100644
index 0000000000..896f599cb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S
@@ -0,0 +1,90 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type zero_nan,@object
+zero_nan:
+	.double 0.0
+nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+	ASM_SIZE_DIRECTIVE(zero_nan)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+	.text
+ENTRY(__ieee754_scalbl)
+	fldt	16(%esp)
+	fxam
+	fnstsw
+	fldt	4(%esp)
+	andl	$0x4700, %eax
+	cmpl	$0x0700, %eax
+	je	1f
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax
+	je	2f
+	fxam
+	fnstsw
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax
+	je	2f
+	fld	%st(1)
+	frndint
+	fcomp	%st(2)
+	fnstsw
+	sahf
+	jne	4f
+	fscale
+	fstp	%st(1)
+	ret
+
+	/* y is -inf */
+1:	fxam
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fnstsw
+	movl	12(%esp), %edx
+	shrl	$5, %eax
+	fstp	%st
+	fstp	%st
+	andl	$0x8000, %edx
+	andl	$0x0228, %eax
+	cmpl	$0x0028, %eax
+	je	4f
+	andl	$8, %eax
+	shrl	$11, %edx
+	addl	%edx, %eax
+	fldl	MOX(zero_nan, %eax, 1)
+	ret
+
+	/* The result is NaN; raise an exception for sNaN arguments.  */
+2:	faddp
+	ret
+
+	/* Return NaN and raise the invalid exception.  */
+4:	fstp	%st
+	fstp	%st
+	fldz
+	fdiv	%st
+	ret
+END(__ieee754_scalbl)
+strong_alias (__ieee754_scalbl, __scalbl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S b/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S
new file mode 100644
index 0000000000..fba5833a9a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_sqrt)
+	fldl	4(%esp)
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fstcw	4(%esp)
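+	// Clear precision-control bit 8: the default extended mode
+	// (PC=11) becomes double (PC=10), so fsqrt rounds once,
+	// directly to double precision.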
+	movl	$0xfeff, %edx
+	andl	4(%esp), %edx
+	movl	%edx, (%esp)
+	fldcw	(%esp)
+	fsqrt
+	fldcw	4(%esp)
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	ret
+END (__ieee754_sqrt)
+strong_alias (__ieee754_sqrt, __sqrt_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S b/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S
new file mode 100644
index 0000000000..6f7e4b015f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S
@@ -0,0 +1,13 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_sqrtf)
+	flds	4(%esp)
+	fsqrt
+	ret
+END (__ieee754_sqrtf)
+strong_alias (__ieee754_sqrtf, __sqrtf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c b/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c
new file mode 100644
index 0000000000..41bcd7eeb7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+#undef __ieee754_sqrtl
+long double
+__ieee754_sqrtl (long double x)
+{
+  long double res;
+
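+  /* The "0" constraint ties the input to operand 0, so X is loaded
+     into st(0) and fsqrt leaves the result there ("=t").  */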
+  asm ("fsqrt" : "=t" (res) : "0" (x));
+
+  return res;
+}
+strong_alias (__ieee754_sqrtl, __sqrtl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c b/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c
new file mode 100644
index 0000000000..5d8596964b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c
@@ -0,0 +1,69 @@
+/* Clear given exceptions in current floating-point environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__feclearexcept (int excepts)
+{
+  fenv_t temp;
+
+  /* Mask out unsupported bits/exceptions.  */
+  excepts &= FE_ALL_EXCEPT;
+
+  /* Bah, we have to clear selected exceptions.  Since there is no
+     `fldsw' instruction, we have to do it the hard way.  */
+  __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+  /* Clear the relevant bits.  */
+  temp.__status_word &= excepts ^ FE_ALL_EXCEPT;
+
+  /* Put the new data in effect.  */
+  __asm__ ("fldenv %0" : : "m" (*&temp));
+
+  /* If the CPU supports SSE, we clear the requested exception flags
+     in the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+      /* Clear the relevant bits.  */
+      xnew_exc &= ~excepts;
+
+      /* Put the new data in effect.  */
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feclearexcept, __old_feclearexcept)
+compat_symbol (libm, __old_feclearexcept, feclearexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feclearexcept, feclearexcept)
+versioned_symbol (libm, __feclearexcept, feclearexcept, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c b/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c
new file mode 100644
index 0000000000..f8db665425
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c
@@ -0,0 +1,54 @@
+/* Disable floating-point exceptions.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+fedisableexcept (int excepts)
+{
+  unsigned short int new_exc, old_exc;
+
+  /* Get the current control word.  */
+  __asm__ ("fstcw %0" : "=m" (*&new_exc));
+
+  old_exc = (~new_exc) & FE_ALL_EXCEPT;
+
+  excepts &= FE_ALL_EXCEPT;
+
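+  /* A set bit in the x87 control word masks (disables) the
+     corresponding exception, so disabling means setting bits.  */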
+  new_exc |= excepts;
+  __asm__ ("fldcw %0" : : "m" (*&new_exc));
+
+  /* If the CPU supports SSE we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current control word.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+      xnew_exc |= excepts << 7;
+
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+
+  return old_exc;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c b/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c
new file mode 100644
index 0000000000..f1c42d7c27
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c
@@ -0,0 +1,54 @@
+/* Enable floating-point exceptions.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+feenableexcept (int excepts)
+{
+  unsigned short int new_exc;
+  unsigned short int old_exc;
+
+  /* Get the current control word.  */
+  __asm__ ("fstcw %0" : "=m" (*&new_exc));
+
+  excepts &= FE_ALL_EXCEPT;
+  old_exc = (~new_exc) & FE_ALL_EXCEPT;
+
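+  /* Clearing a mask bit unmasks (enables) the exception; in the
+     MXCSR the mask bits live seven bits above the flag bits, hence
+     the << 7 below.  */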
+  new_exc &= ~excepts;
+  __asm__ ("fldcw %0" : : "m" (*&new_exc));
+
+  /* If the CPU supports SSE we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current control word.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+      xnew_exc &= ~(excepts << 7);
+
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+
+  return old_exc;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetenv.c b/REORG.TODO/sysdeps/i386/fpu/fegetenv.c
new file mode 100644
index 0000000000..983f6af25e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetenv.c
@@ -0,0 +1,49 @@
+/* Store current floating-point environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fegetenv (fenv_t *envp)
+{
+  __asm__ ("fnstenv %0" : "=m" (*envp));
+  /* And load it right back, since fnstenv has the side effect of
+     masking all exceptions: Intel intended this opcode for interrupt
+     handlers, which would block all exceptions.  */
+  __asm__ ("fldenv %0" : : "m" (*envp));
+
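+  /* The 32-bit fenv_t has no member for the MXCSR, so the
+     otherwise-unused __eip field is used to stash it (see the
+     comment in fenv_private.h).  */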
+  if (HAS_CPU_FEATURE (SSE))
+    __asm__ ("stmxcsr %0" : "=m" (envp->__eip));
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetenv, __old_fegetenv)
+compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__fegetenv)
+libm_hidden_ver (__fegetenv, fegetenv)
+versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c b/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c
new file mode 100644
index 0000000000..dc87b7a470
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c
@@ -0,0 +1,31 @@
+/* Get enabled floating-point exceptions.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+
+int
+fegetexcept (void)
+{
+  unsigned short int exc;
+
+  /* Get the current control word.  */
+  __asm__ ("fstcw %0" : "=m" (*&exc));
+
+  return (~exc) & FE_ALL_EXCEPT;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetmode.c b/REORG.TODO/sysdeps/i386/fpu/fegetmode.c
new file mode 100644
index 0000000000..abbce3075f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetmode.c
@@ -0,0 +1,32 @@
+/* Store current floating-point control modes.  i386 version.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+fegetmode (femode_t *modep)
+{
+  _FPU_GETCW (modep->__control_word);
+  if (HAS_CPU_FEATURE (SSE))
+    __asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr));
+  return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetround.c b/REORG.TODO/sysdeps/i386/fpu/fegetround.c
new file mode 100644
index 0000000000..8ce8b859d8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetround.c
@@ -0,0 +1,33 @@
+/* Return current rounding direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+
+int
+__fegetround (void)
+{
+  int cw;
+
+  __asm__ ("fnstcw %0" : "=m" (*&cw));
+
+  return cw & 0xc00;
+}
+libm_hidden_def (__fegetround)
+weak_alias (__fegetround, fegetround)
+libm_hidden_weak (fegetround)
diff --git a/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c b/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c
new file mode 100644
index 0000000000..d327358913
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c
@@ -0,0 +1,50 @@
+/* Store current floating-point environment and clear exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__feholdexcept (fenv_t *envp)
+{
+  /* Store the environment.  Recall that fnstenv has a side effect of
+     masking all exceptions.  Then clear all exceptions.  */
+  __asm__ volatile ("fnstenv %0; fnclex" : "=m" (*envp));
+
+  /* If the CPU supports SSE we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xwork;
+
+      /* Get the current control word.  */
+      __asm__ ("stmxcsr %0" : "=m" (envp->__eip));
+
+      /* Set all exceptions to non-stop and clear them.  */
+      xwork = (envp->__eip | 0x1f80) & ~0x3f;
+
+      __asm__ ("ldmxcsr %0" : : "m" (*&xwork));
+    }
+
+  return 0;
+}
+libm_hidden_def (__feholdexcept)
+weak_alias (__feholdexcept, feholdexcept)
+libm_hidden_weak (feholdexcept)
diff --git a/REORG.TODO/sysdeps/i386/fpu/fenv_private.h b/REORG.TODO/sysdeps/i386/fpu/fenv_private.h
new file mode 100644
index 0000000000..e20e1f1662
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fenv_private.h
@@ -0,0 +1,501 @@
+#ifndef FENV_PRIVATE_H
+#define FENV_PRIVATE_H 1
+
+#include <fenv.h>
+#include <fpu_control.h>
+
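+/* math_opt_barrier forces X into a register ("x" = SSE register,
+   "t" = top of the x87 stack) so the compiler cannot const-fold or
+   move computations across it; math_force_eval forces X to be
+   evaluated for its side effects, i.e. the exceptions it raises.  */
+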
+#ifdef __SSE2_MATH__
+# define math_opt_barrier(x) \
+  ({ __typeof(x) __x;					\
+     if (sizeof (x) <= sizeof (double))			\
+       __asm ("" : "=x" (__x) : "0" (x));		\
+     else						\
+       __asm ("" : "=t" (__x) : "0" (x));		\
+     __x; })
+# define math_force_eval(x) \
+  do {							\
+    if (sizeof (x) <= sizeof (double))			\
+      __asm __volatile ("" : : "x" (x));		\
+    else						\
+      __asm __volatile ("" : : "f" (x));		\
+  } while (0)
+#else
+# define math_opt_barrier(x) \
+  ({ __typeof (x) __x;					\
+     __asm ("" : "=t" (__x) : "0" (x));			\
+     __x; })
+# define math_force_eval(x) \
+  do {							\
+    __typeof (x) __x = (x);				\
+    if (sizeof (x) <= sizeof (double))			\
+      __asm __volatile ("" : : "m" (__x));		\
+    else						\
+      __asm __volatile ("" : : "f" (__x));		\
+  } while (0)
+#endif
+
+/* This file is used by both the 32- and 64-bit ports.  The 64-bit port
+   has a field in the fenv_t for the mxcsr; the 32-bit port does not.
+   Instead, we (ab)use the only 32-bit field extant in the struct.  */
+#ifndef __x86_64__
+# define __mxcsr	__eip
+#endif
+
+
+/* All of these functions are private to libm, and are all used in pairs
+   to save+change the fp state and restore the original state.  Thus we
+   need not care for both the 387 and the sse unit, only the one we're
+   actually using.  */
+
+#if defined __AVX__ || defined SSE2AVX
+# define STMXCSR "vstmxcsr"
+# define LDMXCSR "vldmxcsr"
+#else
+# define STMXCSR "stmxcsr"
+# define LDMXCSR "ldmxcsr"
+#endif
+
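+/* MXCSR layout used below: bits 0-5 are the exception flags (0x3f),
+   bits 7-12 the exception masks (0x1f80) and bits 13-14 the rounding
+   mode (0x6000).  The x87 rounding bits sit at 10-11 of the control
+   word, so "r << 3" converts an FE_* value to its MXCSR position.  */
+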
+static __always_inline void
+libc_feholdexcept_sse (fenv_t *e)
+{
+  unsigned int mxcsr;
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
+  e->__mxcsr = mxcsr;
+  mxcsr = (mxcsr | 0x1f80) & ~0x3f;
+  asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+static __always_inline void
+libc_feholdexcept_387 (fenv_t *e)
+{
+  /* Recall that fnstenv has a side-effect of masking exceptions.
+     Clobber all of the fp registers so that the TOS field is 0.  */
+  asm volatile ("fnstenv %0; fnclex"
+		: "=m"(*e)
+		: : "st", "st(1)", "st(2)", "st(3)",
+		    "st(4)", "st(5)", "st(6)", "st(7)");
+}
+
+static __always_inline void
+libc_fesetround_sse (int r)
+{
+  unsigned int mxcsr;
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
+  mxcsr = (mxcsr & ~0x6000) | (r << 3);
+  asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+static __always_inline void
+libc_fesetround_387 (int r)
+{
+  fpu_control_t cw;
+  _FPU_GETCW (cw);
+  cw = (cw & ~0xc00) | r;
+  _FPU_SETCW (cw);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_sse (fenv_t *e, int r)
+{
+  unsigned int mxcsr;
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
+  e->__mxcsr = mxcsr;
+  mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
+  asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+/* Set both rounding mode and precision.  A convenience function for use
+   by libc_feholdexcept_setround and libc_feholdexcept_setround_53bit. */
+static __always_inline void
+libc_feholdexcept_setround_387_prec (fenv_t *e, int r)
+{
+  libc_feholdexcept_387 (e);
+
+  fpu_control_t cw = e->__control_word;
+  cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+  cw |= r | 0x3f;
+  _FPU_SETCW (cw);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387 (fenv_t *e, int r)
+{
+  libc_feholdexcept_setround_387_prec (e, r | _FPU_EXTENDED);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_53bit (fenv_t *e, int r)
+{
+  libc_feholdexcept_setround_387_prec (e, r | _FPU_DOUBLE);
+}
+
+static __always_inline int
+libc_fetestexcept_sse (int e)
+{
+  unsigned int mxcsr;
+  asm volatile (STMXCSR " %0" : "=m" (*&mxcsr));
+  return mxcsr & e & FE_ALL_EXCEPT;
+}
+
+static __always_inline int
+libc_fetestexcept_387 (int ex)
+{
+  fexcept_t temp;
+  asm volatile ("fnstsw %0" : "=a" (temp));
+  return temp & ex & FE_ALL_EXCEPT;
+}
+
+static __always_inline void
+libc_fesetenv_sse (fenv_t *e)
+{
+  asm volatile (LDMXCSR " %0" : : "m" (e->__mxcsr));
+}
+
+static __always_inline void
+libc_fesetenv_387 (fenv_t *e)
+{
+  /* Clobber all fp registers so that the TOS value we saved earlier is
+     compatible with the current state of the compiler.  */
+  asm volatile ("fldenv %0"
+		: : "m" (*e)
+		: "st", "st(1)", "st(2)", "st(3)",
+		  "st(4)", "st(5)", "st(6)", "st(7)");
+}
+
+static __always_inline int
+libc_feupdateenv_test_sse (fenv_t *e, int ex)
+{
+  unsigned int mxcsr, old_mxcsr, cur_ex;
+  asm volatile (STMXCSR " %0" : "=m" (*&mxcsr));
+  cur_ex = mxcsr & FE_ALL_EXCEPT;
+
+  /* Merge current exceptions with the old environment.  */
+  old_mxcsr = e->__mxcsr;
+  mxcsr = old_mxcsr | cur_ex;
+  asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+
+  /* Raise SIGFPE for any new exceptions since the hold.  Expect that
+     the normal environment has all exceptions masked.  */
+  if (__glibc_unlikely (~(old_mxcsr >> 7) & cur_ex))
+    __feraiseexcept (cur_ex);
+
+  /* Test for exceptions raised since the hold.  */
+  return cur_ex & ex;
+}
+
+static __always_inline int
+libc_feupdateenv_test_387 (fenv_t *e, int ex)
+{
+  fexcept_t cur_ex;
+
+  /* Save current exceptions.  */
+  asm volatile ("fnstsw %0" : "=a" (cur_ex));
+  cur_ex &= FE_ALL_EXCEPT;
+
+  /* Reload original environment.  */
+  libc_fesetenv_387 (e);
+
+  /* Merge current exceptions.  */
+  __feraiseexcept (cur_ex);
+
+  /* Test for exceptions raised since the hold.  */
+  return cur_ex & ex;
+}
+
+static __always_inline void
+libc_feupdateenv_sse (fenv_t *e)
+{
+  libc_feupdateenv_test_sse (e, 0);
+}
+
+static __always_inline void
+libc_feupdateenv_387 (fenv_t *e)
+{
+  libc_feupdateenv_test_387 (e, 0);
+}
+
+static __always_inline void
+libc_feholdsetround_sse (fenv_t *e, int r)
+{
+  unsigned int mxcsr;
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
+  e->__mxcsr = mxcsr;
+  mxcsr = (mxcsr & ~0x6000) | (r << 3);
+  asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+static __always_inline void
+libc_feholdsetround_387_prec (fenv_t *e, int r)
+{
+  fpu_control_t cw;
+
+  _FPU_GETCW (cw);
+  e->__control_word = cw;
+  cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+  cw |= r;
+  _FPU_SETCW (cw);
+}
+
+static __always_inline void
+libc_feholdsetround_387 (fenv_t *e, int r)
+{
+  libc_feholdsetround_387_prec (e, r | _FPU_EXTENDED);
+}
+
+static __always_inline void
+libc_feholdsetround_387_53bit (fenv_t *e, int r)
+{
+  libc_feholdsetround_387_prec (e, r | _FPU_DOUBLE);
+}
+
+static __always_inline void
+libc_feresetround_sse (fenv_t *e)
+{
+  unsigned int mxcsr;
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
+  mxcsr = (mxcsr & ~0x6000) | (e->__mxcsr & 0x6000);
+  asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+static __always_inline void
+libc_feresetround_387 (fenv_t *e)
+{
+  _FPU_SETCW (e->__control_word);
+}
+
+#ifdef __SSE_MATH__
+# define libc_feholdexceptf		libc_feholdexcept_sse
+# define libc_fesetroundf		libc_fesetround_sse
+# define libc_feholdexcept_setroundf	libc_feholdexcept_setround_sse
+# define libc_fetestexceptf		libc_fetestexcept_sse
+# define libc_fesetenvf			libc_fesetenv_sse
+# define libc_feupdateenv_testf		libc_feupdateenv_test_sse
+# define libc_feupdateenvf		libc_feupdateenv_sse
+# define libc_feholdsetroundf		libc_feholdsetround_sse
+# define libc_feresetroundf		libc_feresetround_sse
+#else
+# define libc_feholdexceptf		libc_feholdexcept_387
+# define libc_fesetroundf		libc_fesetround_387
+# define libc_feholdexcept_setroundf	libc_feholdexcept_setround_387
+# define libc_fetestexceptf		libc_fetestexcept_387
+# define libc_fesetenvf			libc_fesetenv_387
+# define libc_feupdateenv_testf		libc_feupdateenv_test_387
+# define libc_feupdateenvf		libc_feupdateenv_387
+# define libc_feholdsetroundf		libc_feholdsetround_387
+# define libc_feresetroundf		libc_feresetround_387
+#endif /* __SSE_MATH__ */
+
+#ifdef __SSE2_MATH__
+# define libc_feholdexcept		libc_feholdexcept_sse
+# define libc_fesetround		libc_fesetround_sse
+# define libc_feholdexcept_setround	libc_feholdexcept_setround_sse
+# define libc_fetestexcept		libc_fetestexcept_sse
+# define libc_fesetenv			libc_fesetenv_sse
+# define libc_feupdateenv_test		libc_feupdateenv_test_sse
+# define libc_feupdateenv		libc_feupdateenv_sse
+# define libc_feholdsetround		libc_feholdsetround_sse
+# define libc_feresetround		libc_feresetround_sse
+#else
+# define libc_feholdexcept		libc_feholdexcept_387
+# define libc_fesetround		libc_fesetround_387
+# define libc_feholdexcept_setround	libc_feholdexcept_setround_387
+# define libc_fetestexcept		libc_fetestexcept_387
+# define libc_fesetenv			libc_fesetenv_387
+# define libc_feupdateenv_test		libc_feupdateenv_test_387
+# define libc_feupdateenv		libc_feupdateenv_387
+# define libc_feholdsetround		libc_feholdsetround_387
+# define libc_feresetround		libc_feresetround_387
+#endif /* __SSE2_MATH__ */
+
+#define libc_feholdexceptl		libc_feholdexcept_387
+#define libc_fesetroundl		libc_fesetround_387
+#define libc_feholdexcept_setroundl	libc_feholdexcept_setround_387
+#define libc_fetestexceptl		libc_fetestexcept_387
+#define libc_fesetenvl			libc_fesetenv_387
+#define libc_feupdateenv_testl		libc_feupdateenv_test_387
+#define libc_feupdateenvl		libc_feupdateenv_387
+#define libc_feholdsetroundl		libc_feholdsetround_387
+#define libc_feresetroundl		libc_feresetround_387
+
+#ifndef __SSE2_MATH__
+# define libc_feholdexcept_setround_53bit libc_feholdexcept_setround_387_53bit
+# define libc_feholdsetround_53bit	libc_feholdsetround_387_53bit
+#endif
+
+/* We have support for rounding mode context.  */
+#define HAVE_RM_CTX 1
+
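+/* The *_ctx functions back the SET_RESTORE_ROUND machinery: each
+   fehold* variant saves the state into ctx->env and records in
+   ctx->updated_status whether anything was changed, so the matching
+   restore can be skipped when the mode already matched.  */
+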
+static __always_inline void
+libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r)
+{
+  unsigned int mxcsr, new_mxcsr;
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
+  new_mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
+
+  ctx->env.__mxcsr = mxcsr;
+  if (__glibc_unlikely (mxcsr != new_mxcsr))
+    {
+      asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr));
+      ctx->updated_status = true;
+    }
+  else
+    ctx->updated_status = false;
+}
+
+/* Unconditional since we want to overwrite any exceptions that occurred in the
+   context.  This is also why all fehold* functions unconditionally write into
+   ctx->env.  */
+static __always_inline void
+libc_fesetenv_sse_ctx (struct rm_ctx *ctx)
+{
+  libc_fesetenv_sse (&ctx->env);
+}
+
+static __always_inline void
+libc_feupdateenv_sse_ctx (struct rm_ctx *ctx)
+{
+  if (__glibc_unlikely (ctx->updated_status))
+    libc_feupdateenv_test_sse (&ctx->env, 0);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_prec_ctx (struct rm_ctx *ctx, int r)
+{
+  libc_feholdexcept_387 (&ctx->env);
+
+  fpu_control_t cw = ctx->env.__control_word;
+  fpu_control_t old_cw = cw;
+  cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+  cw |= r | 0x3f;
+
+  if (__glibc_unlikely (old_cw != cw))
+    {
+      _FPU_SETCW (cw);
+      ctx->updated_status = true;
+    }
+  else
+    ctx->updated_status = false;
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_ctx (struct rm_ctx *ctx, int r)
+{
+  libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_EXTENDED);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_53bit_ctx (struct rm_ctx *ctx, int r)
+{
+  libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_DOUBLE);
+}
+
+static __always_inline void
+libc_feholdsetround_387_prec_ctx (struct rm_ctx *ctx, int r)
+{
+  fpu_control_t cw, new_cw;
+
+  _FPU_GETCW (cw);
+  new_cw = cw;
+  new_cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+  new_cw |= r;
+
+  ctx->env.__control_word = cw;
+  if (__glibc_unlikely (new_cw != cw))
+    {
+      _FPU_SETCW (new_cw);
+      ctx->updated_status = true;
+    }
+  else
+    ctx->updated_status = false;
+}
+
+static __always_inline void
+libc_feholdsetround_387_ctx (struct rm_ctx *ctx, int r)
+{
+  libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_EXTENDED);
+}
+
+static __always_inline void
+libc_feholdsetround_387_53bit_ctx (struct rm_ctx *ctx, int r)
+{
+  libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_DOUBLE);
+}
+
+static __always_inline void
+libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r)
+{
+  unsigned int mxcsr, new_mxcsr;
+
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
+  new_mxcsr = (mxcsr & ~0x6000) | (r << 3);
+
+  ctx->env.__mxcsr = mxcsr;
+  if (__glibc_unlikely (new_mxcsr != mxcsr))
+    {
+      asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr));
+      ctx->updated_status = true;
+    }
+  else
+    ctx->updated_status = false;
+}
+
+static __always_inline void
+libc_feresetround_sse_ctx (struct rm_ctx *ctx)
+{
+  if (__glibc_unlikely (ctx->updated_status))
+    libc_feresetround_sse (&ctx->env);
+}
+
+static __always_inline void
+libc_feresetround_387_ctx (struct rm_ctx *ctx)
+{
+  if (__glibc_unlikely (ctx->updated_status))
+    _FPU_SETCW (ctx->env.__control_word);
+}
+
+static __always_inline void
+libc_feupdateenv_387_ctx (struct rm_ctx *ctx)
+{
+  if (__glibc_unlikely (ctx->updated_status))
+    libc_feupdateenv_test_387 (&ctx->env, 0);
+}
+
+#ifdef __SSE_MATH__
+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_sse_ctx
+# define libc_fesetenvf_ctx		libc_fesetenv_sse_ctx
+# define libc_feupdateenvf_ctx		libc_feupdateenv_sse_ctx
+# define libc_feholdsetroundf_ctx	libc_feholdsetround_sse_ctx
+# define libc_feresetroundf_ctx		libc_feresetround_sse_ctx
+#else
+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_387_ctx
+# define libc_feupdateenvf_ctx		libc_feupdateenv_387_ctx
+# define libc_feholdsetroundf_ctx	libc_feholdsetround_387_ctx
+# define libc_feresetroundf_ctx		libc_feresetround_387_ctx
+#endif /* __SSE_MATH__ */
+
+#ifdef __SSE2_MATH__
+# define libc_feholdexcept_setround_ctx	libc_feholdexcept_setround_sse_ctx
+# define libc_fesetenv_ctx		libc_fesetenv_sse_ctx
+# define libc_feupdateenv_ctx		libc_feupdateenv_sse_ctx
+# define libc_feholdsetround_ctx	libc_feholdsetround_sse_ctx
+# define libc_feresetround_ctx		libc_feresetround_sse_ctx
+#else
+# define libc_feholdexcept_setround_ctx	libc_feholdexcept_setround_387_ctx
+# define libc_feupdateenv_ctx		libc_feupdateenv_387_ctx
+# define libc_feholdsetround_ctx	libc_feholdsetround_387_ctx
+# define libc_feresetround_ctx		libc_feresetround_387_ctx
+#endif /* __SSE2_MATH__ */
+
+#define libc_feholdexcept_setroundl_ctx	libc_feholdexcept_setround_387_ctx
+#define libc_feupdateenvl_ctx		libc_feupdateenv_387_ctx
+#define libc_feholdsetroundl_ctx	libc_feholdsetround_387_ctx
+#define libc_feresetroundl_ctx		libc_feresetround_387_ctx
+
+#ifndef __SSE2_MATH__
+# define libc_feholdsetround_53bit_ctx	libc_feholdsetround_387_53bit_ctx
+# define libc_feresetround_53bit_ctx	libc_feresetround_387_ctx
+#endif
+
+#undef __mxcsr
+
+#endif /* FENV_PRIVATE_H */
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetenv.c b/REORG.TODO/sysdeps/i386/fpu/fesetenv.c
new file mode 100644
index 0000000000..a338e5d555
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetenv.c
@@ -0,0 +1,131 @@
+/* Install given floating-point environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <assert.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+
+/* All exceptions, including the x86-specific "denormal operand"
+   exception.  */
+#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM)
+
+
+int
+__fesetenv (const fenv_t *envp)
+{
+  fenv_t temp;
+
+  /* The memory block used by fstenv/fldenv has a size of 28 bytes.  */
+  assert (sizeof (fenv_t) == 28);
+
+  /* Install the environment specified by ENVP.  But there are a few
+     values which we do not want to come from the saved environment.
+     Therefore, we get the current environment and replace the values
+     we want to use from the environment specified by the parameter.  */
+  __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+  if (envp == FE_DFL_ENV)
+    {
+      temp.__control_word |= FE_ALL_EXCEPT_X86;
+      temp.__control_word &= ~FE_TOWARDZERO;
+      temp.__control_word |= _FPU_EXTENDED;
+      temp.__status_word &= ~FE_ALL_EXCEPT_X86;
+    }
+  else if (envp == FE_NOMASK_ENV)
+    {
+      temp.__control_word &= ~(FE_ALL_EXCEPT | FE_TOWARDZERO);
+      /* Keep the "denormal operand" exception masked.  */
+      temp.__control_word |= __FE_DENORM;
+      temp.__control_word |= _FPU_EXTENDED;
+      temp.__status_word &= ~FE_ALL_EXCEPT_X86;
+    }
+  else
+    {
+      temp.__control_word &= ~(FE_ALL_EXCEPT_X86
+			       | FE_TOWARDZERO
+			       | _FPU_EXTENDED);
+      temp.__control_word |= (envp->__control_word
+			      & (FE_ALL_EXCEPT_X86
+				 | FE_TOWARDZERO
+				 | _FPU_EXTENDED));
+      temp.__status_word &= ~FE_ALL_EXCEPT_X86;
+      temp.__status_word |= envp->__status_word & FE_ALL_EXCEPT_X86;
+    }
+  temp.__eip = 0;
+  temp.__cs_selector = 0;
+  temp.__opcode = 0;
+  temp.__data_offset = 0;
+  temp.__data_selector = 0;
+
+  __asm__ ("fldenv %0" : : "m" (temp));
+
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int mxcsr;
+      __asm__ ("stmxcsr %0" : "=m" (mxcsr));
+
+      if (envp == FE_DFL_ENV)
+	{
+	  /* Clear SSE exceptions.  */
+	  mxcsr &= ~FE_ALL_EXCEPT_X86;
+	  /* Set mask for SSE MXCSR.  */
+	  mxcsr |= (FE_ALL_EXCEPT_X86 << 7);
+	  /* Set rounding to FE_TONEAREST.  */
+	  mxcsr &= ~0x6000;
+	  mxcsr |= (FE_TONEAREST << 3);
+	  /* Clear the FZ and DAZ bits.  */
+	  mxcsr &= ~0x8040;
+	}
+      else if (envp == FE_NOMASK_ENV)
+	{
+	  /* Clear SSE exceptions.  */
+	  mxcsr &= ~FE_ALL_EXCEPT_X86;
+	  /* Do not mask exceptions.  */
+	  mxcsr &= ~(FE_ALL_EXCEPT << 7);
+	  /* Keep the "denormal operand" exception masked.  */
+	  mxcsr |= (__FE_DENORM << 7);
+	  /* Set rounding to FE_TONEAREST.  */
+	  mxcsr &= ~0x6000;
+	  mxcsr |= (FE_TONEAREST << 3);
+	  /* Clear the FZ and DAZ bits.  */
+	  mxcsr &= ~0x8040;
+	}
+      else
+	mxcsr = envp->__eip;
+
+      __asm__ ("ldmxcsr %0" : : "m" (mxcsr));
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetenv, __old_fesetenv)
+compat_symbol (libm, __old_fesetenv, fesetenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__fesetenv)
+libm_hidden_ver (__fesetenv, fesetenv)
+versioned_symbol (libm, __fesetenv, fesetenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c b/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c
new file mode 100644
index 0000000000..adfcf17ba6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c
@@ -0,0 +1,31 @@
+/* Set given exception flags.  i386 version.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+
+int
+fesetexcept (int excepts)
+{
+  fenv_t temp;
+
+  __asm__ ("fnstenv %0" : "=m" (*&temp));
+  temp.__status_word |= excepts & FE_ALL_EXCEPT;
+  __asm__ ("fldenv %0" : : "m" (*&temp));
+
+  return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetmode.c b/REORG.TODO/sysdeps/i386/fpu/fesetmode.c
new file mode 100644
index 0000000000..bd9f74cd97
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetmode.c
@@ -0,0 +1,54 @@
+/* Install given floating-point control modes.  i386 version.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+/* All exceptions, including the x86-specific "denormal operand"
+   exception.  */
+#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM)
+
+int
+fesetmode (const femode_t *modep)
+{
+  fpu_control_t cw;
+  if (modep == FE_DFL_MODE)
+    cw = _FPU_DEFAULT;
+  else
+    cw = modep->__control_word;
+  _FPU_SETCW (cw);
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int mxcsr;
+      __asm__ ("stmxcsr %0" : "=m" (mxcsr));
+      /* Preserve SSE exception flags but restore other state in
+	 MXCSR.  */
+      mxcsr &= FE_ALL_EXCEPT_X86;
+      if (modep == FE_DFL_MODE)
+	/* Default MXCSR state has all bits zero except for those
+	   masking exceptions.  */
+	mxcsr |= FE_ALL_EXCEPT_X86 << 7;
+      else
+	mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86;
+      __asm__ ("ldmxcsr %0" : : "m" (mxcsr));
+    }
+  return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetround.c b/REORG.TODO/sysdeps/i386/fpu/fesetround.c
new file mode 100644
index 0000000000..a3fa6235c0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetround.c
@@ -0,0 +1,54 @@
+/* Set current rounding direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fesetround (int round)
+{
+  unsigned short int cw;
+
+  if ((round & ~0xc00) != 0)
+    /* ROUND is not a valid rounding mode.  */
+    return 1;
+
+  __asm__ ("fnstcw %0" : "=m" (*&cw));
+  cw &= ~0xc00;
+  cw |= round;
+  __asm__ ("fldcw %0" : : "m" (*&cw));
+
+  /* If the CPU supports SSE we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xcw;
+
+      __asm__ ("stmxcsr %0" : "=m" (*&xcw));
+      xcw &= ~0x6000;
+      xcw |= round << 3;
+      __asm__ ("ldmxcsr %0" : : "m" (*&xcw));
+    }
+
+  return 0;
+}
+libm_hidden_def (__fesetround)
+weak_alias (__fesetround, fesetround)
+libm_hidden_weak (fesetround)
diff --git a/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c b/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c
new file mode 100644
index 0000000000..b610289cd0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c
@@ -0,0 +1,60 @@
+/* Install given floating-point environment and raise exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+#include <ldsodefs.h>
+
+int
+__feupdateenv (const fenv_t *envp)
+{
+  fexcept_t temp;
+  unsigned int xtemp = 0;
+
+  /* Save current exceptions.  */
+  __asm__ ("fnstsw %0" : "=m" (*&temp));
+
+  /* If the CPU supports SSE we test the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    __asm__ ("stmxcsr %0" : "=m" (*&xtemp));
+
+  temp = (temp | xtemp) & FE_ALL_EXCEPT;
+
+  /* Install new environment.  */
+  __fesetenv (envp);
+
+  /* Raise the saved exceptions.  Incidentally, for us the
+     implementation-defined format of the values in objects of type
+     fexcept_t is the same as that specified using the FE_* constants.  */
+  __feraiseexcept ((int) temp);
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feupdateenv, __old_feupdateenv)
+compat_symbol (libm, __old_feupdateenv, feupdateenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__feupdateenv)
+libm_hidden_ver (__feupdateenv, feupdateenv)
+versioned_symbol (libm, __feupdateenv, feupdateenv, GLIBC_2_2);
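
feupdateenv is half of the classic deferred-trap idiom: feholdexcept switches to non-stop mode with clear flags, and feupdateenv later reinstates the caller's environment and re-raises whatever accumulated in between. A sketch (guarded is an illustrative name):

    #include <fenv.h>

    double
    guarded (double (*f) (double), double x)
    {
      fenv_t env;
      feholdexcept (&env);    /* non-stop mode, flags cleared */
      double r = f (x);
      feupdateenv (&env);     /* env back, f's exceptions re-raised */
      return r;
    }
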
diff --git a/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c b/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c
new file mode 100644
index 0000000000..954e5f69d8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c
@@ -0,0 +1,57 @@
+/* Store current representation for exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+
+int
+__fegetexceptflag (fexcept_t *flagp, int excepts)
+{
+  fexcept_t temp;
+
+  /* Get the current exceptions.  */
+  __asm__ ("fnstsw %0" : "=m" (*&temp));
+
+  *flagp = temp & excepts & FE_ALL_EXCEPT;
+
+  /* If the CPU supports SSE, we read the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int sse_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&sse_exc));
+
+      *flagp |= sse_exc & excepts & FE_ALL_EXCEPT;
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetexceptflag, __old_fegetexceptflag)
+compat_symbol (libm, __old_fegetexceptflag, fegetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fegetexceptflag, fegetexceptflag, GLIBC_2_2);
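
Because fexcept_t is opaque to portable code, the value stored here is only meaningful when fed back to fesetexceptflag. The usual snapshot/restore pattern, sketched (flags_preserved is an illustrative name):

    #include <fenv.h>

    double
    flags_preserved (double (*f) (double), double x)
    {
      fexcept_t saved;
      fegetexceptflag (&saved, FE_ALL_EXCEPT);  /* snapshot all five flags */
      double r = f (x);
      fesetexceptflag (&saved, FE_ALL_EXCEPT);  /* put them back */
      return r;
    }
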
diff --git a/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c b/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c
new file mode 100644
index 0000000000..913d7b912c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c
@@ -0,0 +1,124 @@
+/* Raise given exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <math.h>
+
+int
+__feraiseexcept (int excepts)
+{
+  /* Raise exceptions represented by EXCEPTS.  But we must raise only
+     one signal at a time.  It is important that if the overflow/underflow
+     exception and the inexact exception are given at the same time,
+     the inexact exception follows the overflow/underflow exception.  */
+
+  /* First: invalid exception.  */
+  if ((FE_INVALID & excepts) != 0)
+    {
+      /* One example of an invalid operation is 0.0 / 0.0.  */
+      double d;
+      __asm__ __volatile__ ("fldz; fdiv %%st, %%st(0); fwait" : "=t" (d));
+      (void) &d;
+    }
+
+  /* Next: division by zero.  */
+  if ((FE_DIVBYZERO & excepts) != 0)
+    {
+      double d;
+      __asm__ __volatile__ ("fldz; fld1; fdivp %%st, %%st(1); fwait"
+			    : "=t" (d));
+      (void) &d;
+    }
+
+  /* Next: overflow.  */
+  if ((FE_OVERFLOW & excepts) != 0)
+    {
+      /* There is no way to raise only the overflow flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* Bah, we have to set selected exception bits.  Since there is no
+	 `fldsw' instruction we have to do it the hard way.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_OVERFLOW;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* Next: underflow.  */
+  if ((FE_UNDERFLOW & excepts) != 0)
+    {
+      /* There is no way to raise only the underflow flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* Bah, we have to set selected exception bits.  Since there is no
+	 `fldsw' instruction we have to do it the hard way.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_UNDERFLOW;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* Last: inexact.  */
+  if ((FE_INEXACT & excepts) != 0)
+    {
+      /* There is no way to raise only the inexact flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* Bah, we have to set selected exception bits.  Since there is no
+	 `fldsw' instruction we have to do it the hard way.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_INEXACT;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feraiseexcept, __old_feraiseexcept)
+compat_symbol (libm, __old_feraiseexcept, feraiseexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__feraiseexcept)
+libm_hidden_ver (__feraiseexcept, feraiseexcept)
+versioned_symbol (libm, __feraiseexcept, feraiseexcept, GLIBC_2_2);
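
Note that this implementation raises exceptions by executing real arithmetic (an actual fdiv for FE_DIVBYZERO) or by reloading the environment plus fwait, so unmasked exceptions genuinely trap rather than merely setting flags. A minimal check:

    #include <fenv.h>
    #include <stdio.h>

    int
    main (void)
    {
      feclearexcept (FE_ALL_EXCEPT);
      feraiseexcept (FE_DIVBYZERO | FE_INEXACT);   /* real fdiv + env reload */
      printf ("%d\n", fetestexcept (FE_DIVBYZERO) != 0);   /* prints 1 */
      printf ("%d\n", fetestexcept (FE_INEXACT) != 0);     /* prints 1 */
      return 0;
    }
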
diff --git a/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c b/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c
new file mode 100644
index 0000000000..efa64aaefd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c
@@ -0,0 +1,69 @@
+/* Set floating-point environment exception handling.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <math.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fesetexceptflag (const fexcept_t *flagp, int excepts)
+{
+  fenv_t temp;
+
+  /* Get the current environment.  We have to do this since we cannot
+     separately set the status word.  */
+  __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+  temp.__status_word &= ~(excepts & FE_ALL_EXCEPT);
+  temp.__status_word |= *flagp & excepts & FE_ALL_EXCEPT;
+
+  /* Store the new status word (along with the rest of the environment).
+     Possibly new exceptions are set, but they won't be raised until
+     the next floating-point instruction.  */
+  __asm__ ("fldenv %0" : : "m" (*&temp));
+
+  /* If the CPU supports SSE, we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+      /* Set the relevant bits.  */
+      xnew_exc &= ~(excepts & FE_ALL_EXCEPT);
+      xnew_exc |= *flagp & excepts & FE_ALL_EXCEPT;
+
+      /* Put the new data in effect.  */
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetexceptflag, __old_fesetexceptflag)
+compat_symbol (libm, __old_fesetexceptflag, fesetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fesetexceptflag, fesetexceptflag, GLIBC_2_2);
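
Contrast with feraiseexcept above: fesetexceptflag only writes flag bits back via fldenv/ldmxcsr and never executes a faulting operation itself. Constructing the fexcept_t directly from an FE_* constant works in this implementation because, as the feupdateenv.c comment notes, the two formats coincide (mark_invalid is an illustrative name):

    #include <fenv.h>

    void
    mark_invalid (void)
    {
      fexcept_t f = FE_INVALID;           /* format matches FE_* here */
      fesetexceptflag (&f, FE_INVALID);   /* flag deposited, nothing raised */
    }
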
diff --git a/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c b/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c
new file mode 100644
index 0000000000..f523f9e709
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c
@@ -0,0 +1,40 @@
+/* Test exception in current environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+#include <ldsodefs.h>
+
+int
+fetestexcept (int excepts)
+{
+  short temp;
+  int xtemp = 0;
+
+  /* Get current exceptions.  */
+  __asm__ ("fnstsw %0" : "=a" (temp));
+
+  /* If the CPU supports SSE we test the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    __asm__ ("stmxcsr %0" : "=m" (*&xtemp));
+
+  return (temp | xtemp) & excepts & FE_ALL_EXCEPT;
+}
+libm_hidden_def (fetestexcept)
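
Since fetestexcept ORs the x87 status word with MXCSR, it sees an exception regardless of whether a 387 or an SSE instruction set it. A typical use, testing whether a narrowing conversion was exact (converts_exactly is an illustrative name):

    #include <fenv.h>

    int
    converts_exactly (double x)
    {
      feclearexcept (FE_INEXACT);
      volatile float f = (float) x;       /* sets FE_INEXACT if it rounds */
      (void) f;
      return fetestexcept (FE_INEXACT) == 0;
    }
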
diff --git a/REORG.TODO/sysdeps/i386/fpu/halfulp.c b/REORG.TODO/sysdeps/i386/fpu/halfulp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/halfulp.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h b/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h
new file mode 100644
index 0000000000..6ffc8e6f64
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h
@@ -0,0 +1,340 @@
+/* Helper macros for x86 libm functions.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _I386_MATH_ASM_H
+#define _I386_MATH_ASM_H 1
+
+/* Remove excess range and precision by storing a value on the stack
+   and loading it back.  */
+#define FLT_NARROW_EVAL				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);
+
+/* Define constants for the minimum value of a floating-point
+   type.  */
+#define DEFINE_FLT_MIN				\
+	.section .rodata.cst4,"aM",@progbits,4;	\
+	.p2align 2;				\
+	.type flt_min,@object;			\
+flt_min:					\
+	.byte 0, 0, 0x80, 0;			\
+	.size flt_min, .-flt_min;
+#define DEFINE_DBL_MIN				\
+	.section .rodata.cst8,"aM",@progbits,8;	\
+	.p2align 3;				\
+	.type dbl_min,@object;			\
+dbl_min:					\
+	.byte 0, 0, 0, 0, 0, 0, 0x10, 0;	\
+	.size dbl_min, .-dbl_min;
+#define DEFINE_LDBL_MIN					\
+	.section .rodata.cst16,"aM",@progbits,16;	\
+	.p2align 4;					\
+	.type ldbl_min,@object;				\
+ldbl_min:						\
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x1, 0;	\
+	.byte 0, 0, 0, 0, 0, 0;				\
+	.size ldbl_min, .-ldbl_min;
+
+/* Remove excess range and precision by storing a value on the stack
+   and loading it back.  The value is given to be nonnegative or NaN;
+   if it is subnormal, also force an underflow exception.  The
+   relevant constant for the minimum of the type must have been
+   defined, the MO macro must have been defined for access to memory
+   operands, and, if PIC, the PIC register must have been loaded.  */
+#define FLT_NARROW_EVAL_UFLOW_NONNEG_NAN	\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	flds	MO(flt_min);			\
+	fld	%st(1);				\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+6424:	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNEG_NAN	\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fldl	MO(dbl_min);			\
+	fld	%st(1);				\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+6453:	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);
+
+/* Likewise, but the argument is not a NaN (so fcom instructions,
+   which support memory operands, can be used).  */
+#define FLT_NARROW_EVAL_UFLOW_NONNEG		\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fcoms	MO(flt_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+6424:	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNEG		\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fcoml	MO(dbl_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+6453:	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);
+
+/* Likewise, but the non-NaN argument may be negative.  */
+#define FLT_NARROW_EVAL_UFLOW_NONNAN		\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fabs;					\
+	fcomps	MO(flt_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+6424:	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNAN		\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fabs;					\
+	fcompl	MO(dbl_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+6453:	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);
+
+/* Force an underflow exception if the given value is subnormal.  The
+   relevant constant for the minimum of the type must have been
+   defined, the MO macro must have been defined for access to memory
+   operands, and, if PIC, the PIC register must have been loaded.  */
+#define FLT_CHECK_FORCE_UFLOW			\
+	flds	MO(flt_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);		\
+6424:
+#define DBL_CHECK_FORCE_UFLOW			\
+	fldl	MO(dbl_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);		\
+6453:
+
+/* Likewise, but also remove excess range and precision if the value
+   is subnormal.  */
+#define FLT_CHECK_FORCE_UFLOW_NARROW		\
+	flds	MO(flt_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);		\
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NARROW		\
+	fldl	MO(dbl_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);		\
+6453:
+
+/* Likewise, but the argument is nonnegative or NaN.  */
+#define LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN	\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+/* Likewise, but the argument is not a NaN.  */
+#define FLT_CHECK_FORCE_UFLOW_NONNAN		\
+	fld %st(0);				\
+	fabs;					\
+	fcomps	MO(flt_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);		\
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NONNAN		\
+	fld %st(0);				\
+	fabs;					\
+	fcompl	MO(dbl_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);		\
+6453:
+#define LDBL_CHECK_FORCE_UFLOW_NONNAN		\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fcompp;					\
+	fnstsw;					\
+	sahf;					\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+/* Likewise, but the argument is nonnegative and not a NaN.  */
+#define FLT_CHECK_FORCE_UFLOW_NONNEG		\
+	fcoms	MO(flt_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);		\
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NONNEG		\
+	fcoml	MO(dbl_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);		\
+6453:
+#define LDBL_CHECK_FORCE_UFLOW_NONNEG		\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fcompp;					\
+	fnstsw;					\
+	sahf;					\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+#endif /* i386-math-asm.h.  */
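
These macros exist because the x87 evaluates in 80-bit registers and only rounds to the declared type on a store to memory; likewise, underflow on a subnormal result is only signalled once a real computation produces it, which the macros force by squaring the tiny value and storing the product. A C-level analogue of the two ideas (a sketch assuming FLT_EVAL_METHOD == 2, not what the assembly literally expands to):

    #include <float.h>
    #include <math.h>

    /* The fstpl/fldl round trip of DBL_NARROW_EVAL.  */
    static inline double
    dbl_narrow_eval (double x)
    {
      volatile double d = x;              /* store rounds away excess bits */
      return d;
    }

    /* The squaring trick of DBL_CHECK_FORCE_UFLOW.  */
    static inline void
    force_uflow (double r)
    {
      if (r != 0.0 && fabs (r) < DBL_MIN)
        {
          volatile double tmp = r * r;    /* underflows (and is inexact) */
          (void) tmp;
        }
    }
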
diff --git a/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps
new file mode 100644
index 0000000000..0fc50907ad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps
@@ -0,0 +1,2202 @@
+# Begin of automatic generation
+
+# Maximal error of functions:
+Function: "acos":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "acos_downward":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_towardzero":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "acosh":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 2
+
+Function: "acosh_downward":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_towardzero":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 3
+
+Function: "asin":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "asin_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asinh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "asinh_downward":
+double: 1
+float: 1
+idouble: 1
+ildouble: 5
+ldouble: 5
+
+Function: "asinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "asinh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "atan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "atanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "atanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 3
+
+Function: "atanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "cabs":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "cacos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "cacos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacosh_downward":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cacosh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacosh_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "carg":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "casin_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "casin_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "casin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casin_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casinh_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Imaginary part of "casinh_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "casinh_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "casinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "catan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "catanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catanh":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cbrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "cbrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccosh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cexp":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cexp":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cexp_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cexp_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "clog":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog10":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog10":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "clog10_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "clog10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cos_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh":
+double: 1
+float: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: "cosh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: Real part of "cpow":
+double: 2
+float: 5
+idouble: 2
+ifloat: 5
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cpow":
+float: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "cpow_downward":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cpow_towardzero":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cpow_upward":
+double: 4
+float: 1
+idouble: 4
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cpow_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csin":
+float: 1
+ifloat: 1
+
+Function: Real part of "csin_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csinh":
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "csinh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csinh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csqrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ctan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_towardzero":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctan_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ctanh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctanh_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctanh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "erf":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erfc":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "erfc_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "exp":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_downward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_upward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "expm1":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "expm1_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "gamma":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "gamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_towardzero":
+double: 4
+float: 2
+idouble: 4
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "hypot":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "j0_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j0_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "j0_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j1":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j1_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "j1_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: "jn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "jn_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "lgamma":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "lgamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_towardzero":
+double: 4
+float: 2
+idouble: 4
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "log":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log1p":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log1p_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "log2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow_downward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_towardzero":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "sin":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "sin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "sincos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sincos_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sinh":
+double: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sinh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "sinh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "sinh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "tan":
+float: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "tan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "tanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 7
+ldouble: 4
+
+Function: "tanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 4
+
+Function: "tgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "y0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "y0_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "y1":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "y1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "y1_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y1_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "yn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "yn_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "yn_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "yn_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+# end of automatic generation
diff --git a/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name
new file mode 100644
index 0000000000..54ca0d8295
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name
@@ -0,0 +1 @@
+ix86
diff --git a/REORG.TODO/sysdeps/i386/fpu/math-tests.h b/REORG.TODO/sysdeps/i386/fpu/math-tests.h
new file mode 100644
index 0000000000..26d0633dc0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/math-tests.h
@@ -0,0 +1,27 @@
+/* Configuration for math tests.  32-bit x86 version.
+   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* On 32-bit x86, versions of GCC up to at least 4.8 are happy to use FPU load
+   instructions for sNaN values, and loading a float or double sNaN value will
+   already raise an INVALID exception as well as turn the sNaN into a qNaN,
+   rendering certain tests infeasible in this scenario.
+   <http://gcc.gnu.org/PR56831>.  */
+#define SNAN_TESTS_float	0
+#define SNAN_TESTS_double	0
+
+#include_next <math-tests.h>
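
A concrete illustration of why these tests are switched off here: merely copying an sNaN through an x87 register quiets it and raises FE_INVALID, so a test that expects the sNaN to survive an assignment cannot pass (snan_survives_copy is an illustrative name; whether the copy actually goes through the FPU depends on the compiler):

    #include <fenv.h>

    int
    snan_survives_copy (void)
    {
      float s = __builtin_nansf ("");     /* GCC builtin: signaling NaN */
      feclearexcept (FE_INVALID);
      volatile float t = s;               /* may compile to flds/fstps */
      (void) t;
      return fetestexcept (FE_INVALID) == 0;   /* 0 if the load signalled */
    }
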
diff --git a/REORG.TODO/sysdeps/i386/fpu/math_private.h b/REORG.TODO/sysdeps/i386/fpu/math_private.h
new file mode 100644
index 0000000000..485214391f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/math_private.h
@@ -0,0 +1,7 @@
+#ifndef I386_MATH_PRIVATE_H
+#define I386_MATH_PRIVATE_H 1
+
+#include "fenv_private.h"
+#include_next <math_private.h>
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpatan.c b/REORG.TODO/sysdeps/i386/fpu/mpatan.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpatan.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpatan2.c b/REORG.TODO/sysdeps/i386/fpu/mpatan2.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpatan2.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpexp.c b/REORG.TODO/sysdeps/i386/fpu/mpexp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpexp.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mplog.c b/REORG.TODO/sysdeps/i386/fpu/mplog.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mplog.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c b/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinh.S b/REORG.TODO/sysdeps/i386/fpu/s_asinh.S
new file mode 100644
index 0000000000..1a60f7de2c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinh.S
@@ -0,0 +1,139 @@
+/* ix87 specific implementation of arcsinh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type huge,@object
+huge:	.double 1e+300
+	ASM_SIZE_DIRECTIVE(huge)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__asinh)
+	movl	8(%esp), %ecx
+	movl	$0x7fffffff, %eax
+	andl	%ecx, %eax
+	andl	$0x80000000, %ecx
+	movl	%eax, %edx
+	orl	$0x800fffff, %edx
+	incl	%edx
+	jz	7f			// x is ±Inf or NaN
+	xorl	%ecx, 8(%esp)
+	fldl	4(%esp)			// |x|
+	cmpl	$0x3e300000, %eax
+	jb	2f			// |x| < 2^-28
+	fldln2				// log(2) : |x|
+	cmpl	$0x41b00000, %eax
+	fxch				// |x| : log(2)
+	ja	3f			// |x| > 2^28
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x40000000, %eax
+	ja	5f			// |x| > 2
+
+	// 2^-28 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+	fld	%st			// |x| : |x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : log(2)
+	fld	%st			// |x|^2 : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x|^2 : |x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	fdivrp				// |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+	faddp				// |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+	fcoml	MO(limit)
+	fnstsw
+	sahf
+	ja	6f
+	fyl2xp1
+	jecxz	4f
+	fchs
+4:	ret
+
+7:	fldl	4(%esp)
+	ret
+
+6:	faddl	MO(one)
+	fyl2x
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| < 2^-28 => y = x (inexact iff |x| != 0.0)
+	.align ALIGNARG(4)
+2:
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	jecxz	4f
+	fchs				// x
+4:	fld	%st			// x : x
+	faddl	MO(huge)		// huge+x : x
+	fstp	%st(0)			// x
+	cmpl	$0x00100000, %eax
+	jae	8f
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fld	%st(0)
+	fmul	%st(0)
+	fstpl	(%esp)
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+8:	ret
+
+	// |x| > 2^28 => y = sign(x) * (log(|x|) + log(2))
+	.align ALIGNARG(4)
+3:	fyl2x				// log(|x|)
+	fldln2				// log(2) : log(|x|)
+	faddp				// log(|x|)+log(2)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+	.align ALIGNARG(4)
+5:	fld	%st			// |x| : |x| : log(2)
+	fadd	%st, %st(1)		// |x| : 2*|x| : log(2)
+	fld	%st			// |x| : |x| : 2*|x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : 2*|x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x| : 2*|x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+	faddp				// |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+	fdivrl	MO(one)			// 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+	faddp				// 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+	fyl2x				// log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+	jecxz	4f
+	fchs
+4:	ret
+END(__asinh)
+weak_alias (__asinh, asinh)
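The four branches above implement standard asinh range reduction.  For
reference, the same case analysis as a hedged C sketch (illustrative
only, not part of this patch; the function name is an assumption, and
the flag-forcing games of the tiny-argument path are only noted in a
comment):

  #include <math.h>

  /* Sketch of the range reduction used by __asinh above.  */
  static double asinh_sketch (double x)
  {
    double ax = fabs (x);
    double y;
    if (ax < 0x1p-28)
      /* asinh(x) rounds to x; the asm additionally forces the
         inexact (and, for subnormals, underflow) flags here.  */
      y = ax;
    else if (ax > 0x1p28)
      y = log (ax) + M_LN2;     /* 1/x^2 is negligible */
    else if (ax > 2.0)
      y = log (2.0 * ax + 1.0 / (ax + sqrt (ax * ax + 1.0)));
    else
      y = log1p (ax + ax * ax / (1.0 + sqrt (1.0 + ax * ax)));
    return copysign (y, x);
  }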
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S b/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S
new file mode 100644
index 0000000000..12bcfef934
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S
@@ -0,0 +1,139 @@
+/* ix87 specific implementation of arcsinh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type huge,@object
+huge:	.double 1e+36
+	ASM_SIZE_DIRECTIVE(huge)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__asinhf)
+	movl	4(%esp), %ecx
+	movl	$0x7fffffff, %eax
+	andl	%ecx, %eax
+	andl	$0x80000000, %ecx
+	movl	%eax, %edx
+	orl	$0x807fffff, %edx
+	incl	%edx
+	jz	7f			// x is ±Inf or NaN
+	xorl	%ecx, 4(%esp)
+	flds	4(%esp)			// |x|
+	cmpl	$0x38000000, %eax
+	jb	2f			// |x| < 2^-14
+	fldln2				// log(2) : |x|
+	cmpl	$0x47000000, %eax
+	fxch				// |x| : log(2)
+	ja	3f			// |x| > 2^14
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x40000000, %eax
+	ja	5f			// |x| > 2
+
+	// 2^-14 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+	fld	%st			// |x| : |x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : log(2)
+	fld	%st			// |x|^2 : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x|^2 : |x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	fdivrp				// |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+	faddp				// |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+	fcoml	MO(limit)
+	fnstsw
+	sahf
+	ja	6f
+	fyl2xp1
+	jecxz	4f
+	fchs
+4:	ret
+
+7:	flds	4(%esp)
+	ret
+
+6:	faddl	MO(one)
+	fyl2x
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| < 2^-14 => y = x (inexact iff |x| != 0.0)
+	.align ALIGNARG(4)
+2:
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	jecxz	4f
+	fchs				// x
+4:	fld	%st			// x : x
+	faddl	MO(huge)		// huge+x : x
+	fstp	%st(0)			// x
+	cmpl	$0x00800000, %eax
+	jae	8f
+	subl	$4, %esp
+	cfi_adjust_cfa_offset (4)
+	fld	%st(0)
+	fmul	%st(0)
+	fstps	(%esp)
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+8:	ret
+
+	// |x| > 2^14 => y = sign(x) * (log(|x|) + log(2))
+	.align ALIGNARG(4)
+3:	fyl2x				// log(|x|)
+	fldln2				// log(2) : log(|x|)
+	faddp				// log(|x|)+log(2)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+	.align ALIGNARG(4)
+5:	fld	%st			// |x| : |x| : log(2)
+	fadd	%st, %st(1)		// |x| : 2*|x| : log(2)
+	fld	%st			// |x| : |x| : 2*|x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : 2*|x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x| : 2*|x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+	faddp				// |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+	fdivrl	MO(one)			// 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+	faddp				// 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+	fyl2x				// log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+	jecxz	4f
+	fchs
+4:	ret
+END(__asinhf)
+weak_alias (__asinhf, asinhf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S b/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S
new file mode 100644
index 0000000000..f31a267e78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S
@@ -0,0 +1,144 @@
+/* ix87 specific implementation of arcsinh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type huge,@object
+huge:	.tfloat 1e+4930
+	ASM_SIZE_DIRECTIVE(huge)
+	.align ALIGNARG(4)
+	/* Note that we use a double for the value 1.0.  This number
+	   has an exact representation, so we lose no accuracy, and
+	   the code stays simpler.  */
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* This constant need not be precise.  It only has to be known
+	   to lie on the safe side of the threshold for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__asinhl)
+	movl	12(%esp), %ecx
+	movl	$0x7fff, %eax
+	andl	%ecx, %eax
+	andl	$0x8000, %ecx
+	movl	%eax, %edx
+	orl	$0xffff8000, %edx
+	incl	%edx
+	jz	7f			// x is ±Inf or NaN
+	xorl	%ecx, 12(%esp)
+	fldt	4(%esp)			// |x|
+	cmpl	$0x3fde, %eax
+	jb	2f			// |x| < 2^-34
+	fldln2				// log(2) : |x|
+	cmpl	$0x4020, %eax
+	fxch				// |x| : log(2)
+	ja	3f			// |x| > 2^34
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x4000, %eax
+	ja	5f			// |x| > 2
+
+	// 2^-34 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+	fld	%st			// |x| : |x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : log(2)
+	fld	%st			// |x|^2 : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x|^2 : |x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	fdivrp				// |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+	faddp				// |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+	fcoml	MO(limit)
+	fnstsw
+	sahf
+	ja	6f
+	fyl2xp1
+	jecxz	4f
+	fchs
+4:	ret
+
+7:	fldt	4(%esp)
+	fadd	%st
+	ret
+
+6:	faddl	MO(one)
+	fyl2x
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| < 2^-34 => y = x (inexact iff |x| != 0.0)
+	.align ALIGNARG(4)
+2:
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	jecxz	4f
+	fchs				// x
+4:	fld	%st			// x : x
+	fldt	MO(huge)		// huge : x : x
+	faddp				// huge+x : x
+	fstp	%st(0)			// x
+	cmpl	$0x0001, %eax
+	jae	8f
+	fld	%st(0)
+	fmul	%st(0)
+	fstp	%st(0)
+8:	ret
+
+	// |x| > 2^34 => y = sign(x) * (log(|x|) + log(2))
+	.align ALIGNARG(4)
+3:	fyl2x				// log(|x|)
+	fldln2				// log(2) : log(|x|)
+	faddp				// log(|x|)+log(2)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+	.align ALIGNARG(4)
+5:	fld	%st			// |x| : |x| : log(2)
+	fadd	%st, %st(1)		// |x| : 2*|x| : log(2)
+	fld	%st			// |x| : |x| : 2*|x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : 2*|x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x| : 2*|x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+	faddp				// |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+	fdivrl	MO(one)			// 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+	faddp				// 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+	fyl2x				// log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+	jecxz	4f
+	fchs
+4:	ret
+END(__asinhl)
+weak_alias (__asinhl, asinhl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atan.S b/REORG.TODO/sysdeps/i386/fpu/s_atan.S
new file mode 100644
index 0000000000..644de78feb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atan.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_atan.S,v 1.4 1995/05/08 23:50:41 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__atan)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)
+	fld1
+	fpatan
+	DBL_CHECK_FORCE_UFLOW
+	ret
+END (__atan)
+weak_alias (__atan, atan)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atanf.S b/REORG.TODO/sysdeps/i386/fpu/s_atanf.S
new file mode 100644
index 0000000000..0589c1135e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atanf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_atanf.S,v 1.3 1995/05/08 23:51:33 jtc Exp $")
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__atanf)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)
+	fld1
+	fpatan
+	FLT_CHECK_FORCE_UFLOW
+	ret
+END (__atanf)
+weak_alias (__atanf, atanf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atanl.c b/REORG.TODO/sysdeps/i386/fpu/s_atanl.c
new file mode 100644
index 0000000000..b7dba88aad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atanl.c
@@ -0,0 +1,22 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__atanl (long double x)
+{
+  long double res;
+
+  asm ("fld1\n"
+       "fpatan"
+       : "=t" (res) : "0" (x));
+
+  return res;
+}
+
+weak_alias (__atanl, atanl)
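Here "=t" pins the output to the top of the x87 register stack and "0"
ties the input to the same register, so the fld1/fpatan pair computes
atan2(x, 1) = atan(x) entirely on the stack.  A minimal sketch of the
same idiom for plain double (illustrative, assuming GCC on x86; not
part of this patch):

  /* fpatan computes atan(st(1)/st(0)) and pops; with x below the
     pushed 1.0 this yields atan(x).  */
  static double atan_sketch (double x)
  {
    double res;
    asm ("fld1\n\t"
         "fpatan"
         : "=t" (res) : "0" (x));
    return res;
  }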
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S
new file mode 100644
index 0000000000..7f01659eae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S
@@ -0,0 +1,200 @@
+/* Compute cube root of a double value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+   Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+        .section .rodata
+
+        .align ALIGNARG(4)
+        .type f7,@object
+f7:	.double -0.145263899385486377
+	ASM_SIZE_DIRECTIVE(f7)
+        .type f6,@object
+f6:	.double 0.784932344976639262
+	ASM_SIZE_DIRECTIVE(f6)
+        .type f5,@object
+f5:	.double -1.83469277483613086
+	ASM_SIZE_DIRECTIVE(f5)
+        .type f4,@object
+f4:	.double 2.44693122563534430
+	ASM_SIZE_DIRECTIVE(f4)
+        .type f3,@object
+f3:	.double -2.11499494167371287
+	ASM_SIZE_DIRECTIVE(f3)
+        .type f2,@object
+f2:	.double 1.50819193781584896
+	ASM_SIZE_DIRECTIVE(f2)
+        .type f1,@object
+f1:	.double 0.354895765043919860
+	ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2		1.2599210498948731648
+#define ONE_CBRT2	0.793700525984099737355196796584
+#define SQR_CBRT2	1.5874010519681994748
+#define ONE_SQR_CBRT2	0.629960524947436582364439673883
+
+	.type factor,@object
+factor:	.double ONE_SQR_CBRT2
+	.double ONE_CBRT2
+	.double 1.0
+	.double CBRT2
+	.double SQR_CBRT2
+	ASM_SIZE_DIRECTIVE(factor)
+
+        .type two54,@object
+two54:  .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43
+        ASM_SIZE_DIRECTIVE(two54)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
+#else
+#define MO(op) op
+#define MOX(op,x) op(x)
+#endif
+
+	.text
+ENTRY(__cbrt)
+	movl	4(%esp), %ecx
+	movl	8(%esp), %eax
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+	orl	%eax, %ecx
+	jz	1f
+	xorl	%ecx, %ecx
+	cmpl	$0x7ff00000, %eax
+	jae	1f
+
+#ifdef PIC
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	LOAD_PIC_REG (bx)
+#endif
+
+	cmpl	$0x00100000, %eax
+	jae	2f
+
+#ifdef PIC
+	fldl	8(%esp)
+#else
+	fldl	4(%esp)
+#endif
+	fmull	MO(two54)
+	movl	$-54, %ecx
+#ifdef PIC
+	fstpl	8(%esp)
+	movl	12(%esp), %eax
+#else
+	fstpl	4(%esp)
+	movl	8(%esp), %eax
+#endif
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+
+2:	shrl	$20, %eax
+	andl	$0x800fffff, %edx
+	subl	$1022, %eax
+	orl	$0x3fe00000, %edx
+	addl	%eax, %ecx
+#ifdef PIC
+	movl	%edx, 12(%esp)
+
+	fldl	8(%esp)			/* xm */
+#else
+	movl	%edx, 8(%esp)
+
+	fldl	4(%esp)			/* xm */
+#endif
+	fabs
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The right-hand track computes the value for b), and it does so
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Very easy: efficiency.  FP
+	   instructions can overlap with a certain number of integer (and
+	   FP) instructions.  So we get (except for the imull) all the
+	   integer instructions for free.  */
+
+	fld	%st(0)			/* xm : xm */
+
+	fmull	MO(f7)			/* f7*xm : xm */
+			movl	$1431655766, %eax
+	faddl	MO(f6)			/* f6+f7*xm : xm */
+			imull	%ecx
+	fmul	%st(1)			/* (f6+f7*xm)*xm : xm */
+			movl	%ecx, %eax
+	faddl	MO(f5)			/* f5+(f6+f7*xm)*xm : xm */
+			sarl	$31, %eax
+	fmul	%st(1)			/* (f5+(f6+f7*xm)*xm)*xm : xm */
+			subl	%eax, %edx
+	faddl	MO(f4)			/* f4+(f5+(f6+f7*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */
+	faddl	MO(f3)			/* f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */
+	faddl	MO(f2)			/* f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */
+	faddl	MO(f1)			/* u:=f1+(f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */
+
+	fld	%st			/* u : u : xm */
+	fmul	%st(1)			/* u*u : u : xm */
+	fld	%st(2)			/* xm : u*u : u : xm */
+	fadd	%st			/* 2*xm : u*u : u : xm */
+	fxch	%st(1)			/* u*u : 2*xm : u : xm */
+	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
+			movl	%edx, %eax
+	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
+			leal	(%edx,%edx,2),%edx
+	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
+			subl	%edx, %ecx
+	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
+			shll	$3, %ecx
+	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
+	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
+	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+	movl	12(%esp), %eax
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+#else
+	movl	8(%esp), %eax
+#endif
+	testl	%eax, %eax
+	fstp	%st(1)
+	jns	4f
+	fchs
+4:	ret
+
+	/* Return the argument.  */
+1:	fldl	4(%esp)
+	ret
+END(__cbrt)
+weak_alias (__cbrt, cbrt)
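Stripped of the interleaving, the algorithm is: split |x| = m * 2^e
with m in [0.5, 1), evaluate a degree-6 polynomial starting guess
u ~ cbrt(m), apply one Halley iteration u*(u^3 + 2m)/(2u^3 + m), and
recombine via cbrt(2^e) = 2^(e/3) * cbrt(2)^(e%3) using the factor
table.  A hedged C sketch (not part of this patch; names illustrative):

  #include <math.h>

  static double cbrt_sketch (double x)
  {
    if (x == 0.0 || !isfinite (x))
      return x;                        /* the asm's early exits */

    int e;
    double m = frexp (fabs (x), &e);   /* |x| = m * 2^e, m in [0.5, 1) */

    /* Degree-6 starting approximation; coefficients f1..f7 are the
       ones in the table above.  */
    double u = 0.354895765043919860
      + m * (1.50819193781584896
      + m * (-2.11499494167371287
      + m * (2.44693122563534430
      + m * (-1.83469277483613086
      + m * (0.784932344976639262
      + m * -0.145263899385486377)))));

    double t2 = u * u * u;
    u *= (t2 + 2.0 * m) / (2.0 * t2 + m);   /* one Halley iteration */

    /* e%3 is in -2..2, which the 5-entry factor table, indexed by
       e%3 + 2, absorbs.  The asm obtains e/3 without a divide by
       multiplying with 0x55555556 = (2^32 + 2)/3 and keeping the
       high half of the product.  */
    static const double factor[5] = {
      0.629960524947436582364439673883,     /* 2^(-2/3) */
      0.793700525984099737355196796584,     /* 2^(-1/3) */
      1.0,
      1.2599210498948731648,                /* 2^(1/3)  */
      1.5874010519681994748,                /* 2^(2/3)  */
    };
    double y = ldexp (u * factor[e % 3 + 2], e / 3);
    return copysign (y, x);
  }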
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S
new file mode 100644
index 0000000000..645d24372d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S
@@ -0,0 +1,177 @@
+/* Compute cube root of a float value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+   Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+        .section .rodata
+
+        .align ALIGNARG(4)
+        .type f3,@object
+f3:	.double 0.191502161678719066
+        ASM_SIZE_DIRECTIVE(f3)
+        .type f2,@object
+f2:	.double 0.697570460207922770
+        ASM_SIZE_DIRECTIVE(f2)
+        .type f1,@object
+f1:	.double 0.492659620528969547
+        ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2		1.2599210498948731648
+#define ONE_CBRT2	0.793700525984099737355196796584
+#define SQR_CBRT2	1.5874010519681994748
+#define ONE_SQR_CBRT2	0.629960524947436582364439673883
+
+	.type factor,@object
+        .align ALIGNARG(4)
+factor:	.double ONE_SQR_CBRT2
+	.double ONE_CBRT2
+	.double 1.0
+	.double CBRT2
+	.double SQR_CBRT2
+	ASM_SIZE_DIRECTIVE(factor)
+
+        .type two25,@object
+two25:	.byte 0, 0, 0, 0x4c
+        ASM_SIZE_DIRECTIVE(two25)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
+#else
+#define MO(op) op
+#define MOX(op,x) op(x)
+#endif
+
+	.text
+ENTRY(__cbrtf)
+	movl	4(%esp), %eax
+	xorl	%ecx, %ecx
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+	jz	1f
+	cmpl	$0x7f800000, %eax
+	jae	1f
+
+#ifdef PIC
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	LOAD_PIC_REG (bx)
+#endif
+
+	cmpl	$0x00800000, %eax
+	jae	2f
+
+#ifdef PIC
+	flds	8(%esp)
+#else
+	flds	4(%esp)
+#endif
+	fmuls	MO(two25)
+	movl	$-25, %ecx
+#ifdef PIC
+	fstps	8(%esp)
+	movl	8(%esp), %eax
+#else
+	fstps	4(%esp)
+	movl	4(%esp), %eax
+#endif
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+
+2:	shrl	$23, %eax
+	andl	$0x807fffff, %edx
+	subl	$126, %eax
+	orl	$0x3f000000, %edx
+	addl	%eax, %ecx
+#ifdef PIC
+	movl	%edx, 8(%esp)
+
+	flds	8(%esp)			/* xm */
+#else
+	movl	%edx, 4(%esp)
+
+	flds	4(%esp)			/* xm */
+#endif
+	fabs
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The right-hand track computes the value for b), and it does so
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Very easy: efficiency.  FP
+	   instructions can overlap with a certain number of integer (and
+	   FP) instructions.  So we get (except for the imull) all the
+	   integer instructions for free.  */
+
+	fld	%st(0)			/* xm : xm */
+	fmull	MO(f3)			/* f3*xm : xm */
+			movl	$1431655766, %eax
+	fsubrl	MO(f2)			/* f2-f3*xm : xm */
+			imull	%ecx
+	fmul	%st(1)			/* (f2-f3*xm)*xm : xm */
+			movl	%ecx, %eax
+	faddl	MO(f1)			/* u:=f1+(f2-f3*xm)*xm : xm */
+			sarl	$31, %eax
+	fld	%st			/* u : u : xm */
+			subl	%eax, %edx
+	fmul	%st(1)			/* u*u : u : xm */
+	fld	%st(2)			/* xm : u*u : u : xm */
+	fadd	%st			/* 2*xm : u*u : u : xm */
+	fxch	%st(1)			/* u*u : 2*xm : u : xm */
+	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
+			movl	%edx, %eax
+	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
+			leal	(%edx,%edx,2),%edx
+	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
+			subl	%edx, %ecx
+	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
+			shll	$3, %ecx
+	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
+	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
+	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+	movl	8(%esp), %eax
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+#else
+	movl	4(%esp), %eax
+#endif
+	testl	%eax, %eax
+	fstp	%st(1)
+	jns	4f
+	fchs
+4:	ret
+
+	/* Return the argument.  */
+1:	flds	4(%esp)
+	ret
+END(__cbrtf)
+weak_alias (__cbrtf, cbrtf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S
new file mode 100644
index 0000000000..e4a72d29c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S
@@ -0,0 +1,229 @@
+/* Compute cube root of a long double value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+   Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+        .section .rodata
+
+        .align ALIGNARG(4)
+        .type f8,@object
+f8:	.tfloat 0.161617097923756032
+	ASM_SIZE_DIRECTIVE(f8)
+        .align ALIGNARG(4)
+        .type f7,@object
+f7:	.tfloat -0.988553671195413709
+	ASM_SIZE_DIRECTIVE(f7)
+        .align ALIGNARG(4)
+        .type f6,@object
+f6:	.tfloat 2.65298938441952296
+	ASM_SIZE_DIRECTIVE(f6)
+        .align ALIGNARG(4)
+        .type f5,@object
+f5:	.tfloat -4.11151425200350531
+	ASM_SIZE_DIRECTIVE(f5)
+        .align ALIGNARG(4)
+        .type f4,@object
+f4:	.tfloat 4.09559907378707839
+	ASM_SIZE_DIRECTIVE(f4)
+        .align ALIGNARG(4)
+        .type f3,@object
+f3:	.tfloat -2.82414939754975962
+	ASM_SIZE_DIRECTIVE(f3)
+        .align ALIGNARG(4)
+        .type f2,@object
+f2:	.tfloat 1.67595307700780102
+	ASM_SIZE_DIRECTIVE(f2)
+        .align ALIGNARG(4)
+        .type f1,@object
+f1:	.tfloat 0.338058687610520237
+	ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2		1.2599210498948731648
+#define ONE_CBRT2	0.793700525984099737355196796584
+#define SQR_CBRT2	1.5874010519681994748
+#define ONE_SQR_CBRT2	0.629960524947436582364439673883
+
+	/* We make the entries in the following table all 16 bytes
+	   wide to avoid having to implement a multiplication by 10.  */
+	.type factor,@object
+        .align ALIGNARG(4)
+factor:	.tfloat ONE_SQR_CBRT2
+	.byte 0, 0, 0, 0, 0, 0
+	.tfloat ONE_CBRT2
+	.byte 0, 0, 0, 0, 0, 0
+	.tfloat 1.0
+	.byte 0, 0, 0, 0, 0, 0
+	.tfloat CBRT2
+	.byte 0, 0, 0, 0, 0, 0
+	.tfloat SQR_CBRT2
+	ASM_SIZE_DIRECTIVE(factor)
+
+        .type two64,@object
+        .align ALIGNARG(4)
+two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
+        ASM_SIZE_DIRECTIVE(two64)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
+#else
+#define MO(op) op
+#define MOX(op,x) op(x)
+#endif
+
+	.text
+ENTRY(__cbrtl)
+	movl	4(%esp), %ecx
+	movl	12(%esp), %eax
+	orl	8(%esp), %ecx
+	movl	%eax, %edx
+	andl	$0x7fff, %eax
+	orl	%eax, %ecx
+	jz	1f
+	xorl	%ecx, %ecx
+	cmpl	$0x7fff, %eax
+	je	1f
+
+#ifdef PIC
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	LOAD_PIC_REG (bx)
+#endif
+
+	cmpl	$0, %eax
+	jne	2f
+
+#ifdef PIC
+	fldt	8(%esp)
+#else
+	fldt	4(%esp)
+#endif
+	fmull	MO(two64)
+	movl	$-64, %ecx
+#ifdef PIC
+	fstpt	8(%esp)
+	movl	16(%esp), %eax
+#else
+	fstpt	4(%esp)
+	movl	12(%esp), %eax
+#endif
+	movl	%eax, %edx
+	andl	$0x7fff, %eax
+
+2:	andl	$0x8000, %edx
+	subl	$16382, %eax
+	orl	$0x3ffe, %edx
+	addl	%eax, %ecx
+#ifdef PIC
+	movl	%edx, 16(%esp)
+
+	fldt	8(%esp)			/* xm */
+#else
+	movl	%edx, 12(%esp)
+
+	fldt	4(%esp)			/* xm */
+#endif
+	fabs
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The right-hand track computes the value for b), and it does so
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Very easy: efficiency.  FP
+	   instructions can overlap with a certain number of integer (and
+	   FP) instructions.  So we get (except for the imull) all the
+	   integer instructions for free.  */
+
+	fldt	MO(f8)			/* f8 : xm */
+	fmul	%st(1)			/* f8*xm : xm */
+
+	fldt	MO(f7)
+	faddp				/* f7+f8*xm : xm */
+	fmul	%st(1)			/* (f7+f8*xm)*xm : xm */
+			movl	$1431655766, %eax
+	fldt	MO(f6)
+	faddp				/* f6+(f7+f8*xm)*xm : xm */
+			imull	%ecx
+	fmul	%st(1)			/* (f6+(f7+f8*xm)*xm)*xm : xm */
+			movl	%ecx, %eax
+	fldt	MO(f5)
+	faddp				/* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
+			sarl	$31, %eax
+	fmul	%st(1)			/* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
+			subl	%eax, %edx
+	fldt	MO(f4)
+	faddp				/* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
+	fldt	MO(f3)
+	faddp				/* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
+	fldt	MO(f2)
+	faddp				/* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
+	fldt	MO(f1)
+	faddp				/* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
+
+	fld	%st			/* u : u : xm */
+	fmul	%st(1)			/* u*u : u : xm */
+	fld	%st(2)			/* xm : u*u : u : xm */
+	fadd	%st			/* 2*xm : u*u : u : xm */
+	fxch	%st(1)			/* u*u : 2*xm : u : xm */
+	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
+			movl	%edx, %eax
+	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
+			leal	(%edx,%edx,2),%edx
+	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
+			subl	%edx, %ecx
+	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
+			shll	$4, %ecx
+	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
+	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
+	fldt	MOX(32+factor,%ecx)
+	fmulp				/* u*(t2+2*xm)/(2*t2+xm)*FACT */
+	pushl	%eax
+	cfi_adjust_cfa_offset (4)
+	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+	movl	16(%esp), %eax
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+#else
+	movl	12(%esp), %eax
+#endif
+	testl	$0x8000, %eax
+	fstp	%st(1)
+	jz	4f
+	fchs
+4:	ret
+
+	/* Return the argument.  */
+1:	fldt	4(%esp)
+	fadd	%st
+	ret
+END(__cbrtl)
+weak_alias (__cbrtl, cbrtl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceil.S b/REORG.TODO/sysdeps/i386/fpu/s_ceil.S
new file mode 100644
index 0000000000..1226bb2f87
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceil.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ceil.S,v 1.4 1995/05/08 23:52:13 jtc Exp $")
+
+ENTRY(__ceil)
+	fldl	4(%esp)
+	subl	$32,%esp
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use the 32-bit %edx here although only the low 16 bits
+	   are defined.  But none of the operations should care, and
+	   they are faster than the 16-bit operations.  */
+	movl	$0x0800,%edx		/* round towards +oo */
+	orl	4(%esp),%edx
+	andl	$0xfbff,%edx
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__ceil)
+weak_alias (__ceil, ceil)
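Bits 10-11 of the x87 control word select the rounding direction; the
mov/or/and sequence above forces them to 0b10 (round toward +Inf) just
for the frndint.  A C99 sketch of the same idea (an analogy, not part
of this patch; note that the fldenv above also discards the inexact
flag frndint may raise, which this version does not):

  #include <fenv.h>
  #include <math.h>

  static double ceil_sketch (double x)
  {
    int old = fegetround ();
    fesetround (FE_UPWARD);    /* RC bits = 0b10: round toward +Inf */
    double y = rint (x);       /* frndint */
    fesetround (old);
    return y;
  }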
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S b/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S
new file mode 100644
index 0000000000..d345c0973b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ceilf.S,v 1.3 1995/05/08 23:52:44 jtc Exp $")
+
+ENTRY(__ceilf)
+	flds	4(%esp)
+	subl	$32,%esp
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use the 32-bit %edx here although only the low 16 bits
+	   are defined.  But none of the operations should care, and
+	   they are faster than the 16-bit operations.  */
+	movl	$0x0800,%edx		/* round towards +oo */
+	orl	4(%esp),%edx
+	andl	$0xfbff,%edx
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__ceilf)
+weak_alias (__ceilf, ceilf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceill.S b/REORG.TODO/sysdeps/i386/fpu/s_ceill.S
new file mode 100644
index 0000000000..7c08f43b24
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceill.S
@@ -0,0 +1,40 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__ceill)
+	fldt	4(%esp)
+	subl	$32,%esp
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use the 32-bit %edx here although only the low 16 bits
+	   are defined.  But none of the operations should care, and
+	   they are faster than the 16-bit operations.  */
+	movl	$0x0800,%edx		/* round towards +oo */
+	orl	4(%esp),%edx
+	andl	$0xfbff,%edx
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	/* Preserve "invalid" exceptions from sNaN input.  */
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, 8(%esp)
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__ceill)
+weak_alias (__ceill, ceill)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysign.S b/REORG.TODO/sysdeps/i386/fpu/s_copysign.S
new file mode 100644
index 0000000000..2520a94427
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysign.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_copysign.S,v 1.4 1995/05/08 23:53:02 jtc Exp $")
+
+ENTRY(__copysign)
+	movl	16(%esp),%edx
+	movl	8(%esp),%eax
+	andl	$0x80000000,%edx
+	andl	$0x7fffffff,%eax
+	orl	%edx,%eax
+	movl	%eax,8(%esp)
+	fldl	4(%esp)
+	ret
+END (__copysign)
+weak_alias (__copysign, copysign)
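The idea: take the sign bit from y and everything else from x,
operating directly on the high word of the double in its argument
slot.  The same splice on the full 64-bit pattern, as a hedged C
sketch (not part of this patch):

  #include <stdint.h>
  #include <string.h>

  static double copysign_sketch (double x, double y)
  {
    uint64_t ux, uy;
    memcpy (&ux, &x, sizeof ux);
    memcpy (&uy, &y, sizeof uy);
    /* Magnitude of x, sign of y.  */
    ux = (ux & 0x7fffffffffffffffULL) | (uy & 0x8000000000000000ULL);
    memcpy (&x, &ux, sizeof x);
    return x;
  }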
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S b/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S
new file mode 100644
index 0000000000..57b1a6f119
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_copysignf.S,v 1.3 1995/05/08 23:53:25 jtc Exp $")
+
+ENTRY(__copysignf)
+	movl	8(%esp),%edx
+	movl	4(%esp),%eax
+	andl	$0x80000000,%edx
+	andl	$0x7fffffff,%eax
+	orl	%edx,%eax
+	movl	%eax,4(%esp)
+	flds	4(%esp)
+	ret
+END (__copysignf)
+weak_alias (__copysignf, copysignf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S b/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S
new file mode 100644
index 0000000000..2163e7b014
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S
@@ -0,0 +1,21 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__copysignl)
+	movl	24(%esp),%edx
+	movl	12(%esp),%eax
+	andl	$0x8000,%edx
+	andl	$0x7fff,%eax
+	orl	%edx,%eax
+	movl	%eax,12(%esp)
+	fldt	4(%esp)
+	ret
+END (__copysignl)
+weak_alias (__copysignl, copysignl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1.S
new file mode 100644
index 0000000000..59fded2d5a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1.S
@@ -0,0 +1,113 @@
+/* ix87 specific implementation of exp(x)-1.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+   Based on code by John C. Bowman <bowman@ipp-garching.mpg.de>.
+   Corrections by H.J. Lu (hjl@gnu.ai.mit.edu), 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+	/* Using: e^x - 1 = 2^(x * log2(e)) - 1 */
+
+#include <sysdep.h>
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type minus1,@object
+minus1:	.double -1.0
+	ASM_SIZE_DIRECTIVE(minus1)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type l2e,@object
+l2e:	.tfloat 1.442695040888963407359924681002
+	ASM_SIZE_DIRECTIVE(l2e)
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__expm1)
+	movzwl	4+6(%esp), %eax
+	xorb	$0x80, %ah	// invert sign bit (now 1 is "positive")
+	cmpl	$0xc086, %eax	// is num >= 704?
+	jae	HIDDEN_JUMPTARGET (__exp)
+
+	fldl	4(%esp)		// x
+	fxam			// Is NaN, +-Inf or +-0?
+	xorb	$0x80, %ah
+	cmpl	$0xc043, %eax	// is num <= -38.0?
+	fstsw	%ax
+	movb	$0x45, %ch
+	jb	4f
+
+	// Below -38.0 (may be -NaN or -Inf).
+	andb	%ah, %ch
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpb	$0x01, %ch
+	je	5f		// If -NaN, jump.
+	jmp	2f		// -large, possibly -Inf.
+
+4:	// In range -38.0 to 704.0 (may be +-0 but not NaN or +-Inf).
+	andb	%ah, %ch
+	cmpb	$0x40, %ch
+	je	3f		// If +-0, jump.
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+5:	fldt	MO(l2e)		// log2(e) : x
+	fmulp			// log2(e)*x
+	fld	%st		// log2(e)*x : log2(e)*x
+	// Set round-to-nearest temporarily.
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fstcw	4(%esp)
+	movl	$0xf3ff, %ecx
+	andl	4(%esp), %ecx
+	movl	%ecx, (%esp)
+	fldcw	(%esp)
+	frndint			// int(log2(e)*x) : log2(e)*x
+	fldcw	4(%esp)
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fsubr	%st, %st(1)	// int(log2(e)*x) : fract(log2(e)*x)
+	fxch			// fract(log2(e)*x) : int(log2(e)*x)
+	f2xm1			// 2^fract(log2(e)*x)-1 : int(log2(e)*x)
+	fscale			// 2^(log2(e)*x)-2^int(log2(e)*x) : int(log2(e)*x)
+	fxch			// int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fldl	MO(one)		// 1 : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fscale			// 2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrl	MO(one)		// 1-2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fstp	%st(1)		// 1-2^int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrp	%st, %st(1)	// 2^(log2(e)*x)-1 = e^x-1
+	DBL_CHECK_FORCE_UFLOW
+	ret
+
+2:	fstp	%st
+	fldl	MO(minus1)	// Set result to -1.0.
+3:	ret
+END(__expm1)
+weak_alias (__expm1, expm1)
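The core identity above: with t = x*log2(e), i = rint(t) and
f = t - i, e^x - 1 = 2^i*(2^f - 1) + (2^i - 1).  f2xm1 delivers
2^f - 1 directly (no cancellation for small f, but it requires
|f| <= 1/2, hence the temporary round-to-nearest mode around frndint),
and the fscale/fsubr steps assemble the rest.  A hedged C sketch of
the identity (not part of this patch; exp2(f) - 1 merely stands in for
f2xm1, and the NaN/Inf/0 and large-|x| cases that the asm dispatches
separately are omitted):

  #include <math.h>

  static double expm1_sketch (double x)
  {
    double t = x * M_LOG2E;
    double i = rint (t);           /* frndint, round-to-nearest */
    double f = t - i;              /* |f| <= 1/2 */
    double p = exp2 (f) - 1.0;     /* f2xm1 computes this directly */
    /* 2^i*(2^f - 1) + (2^i - 1) == 2^(i+f) - 1 == e^x - 1 */
    return ldexp (p, (int) i) + (ldexp (1.0, (int) i) - 1.0);
  }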
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S
new file mode 100644
index 0000000000..4f0b2e7832
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S
@@ -0,0 +1,113 @@
+/* ix87 specific implementation of exp(x)-1.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+   Based on code by John C. Bowman <bowman@ipp-garching.mpg.de>.
+   Corrections by H.J. Lu (hjl@gnu.ai.mit.edu), 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+	/* Using: e^x - 1 = 2^(x * log2(e)) - 1 */
+
+#include <sysdep.h>
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type minus1,@object
+minus1:	.double -1.0
+	ASM_SIZE_DIRECTIVE(minus1)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type l2e,@object
+l2e:	.tfloat 1.442695040888963407359924681002
+	ASM_SIZE_DIRECTIVE(l2e)
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__expm1f)
+	movzwl	4+2(%esp), %eax
+	xorb	$0x80, %ah	// invert sign bit (now 1 is "positive")
+	cmpl	$0xc2b1, %eax	// is num >= 88.5?
+	jae	HIDDEN_JUMPTARGET (__expf)
+
+	flds	4(%esp)		// x
+	fxam			// Is NaN, +-Inf or +-0?
+	xorb	$0x80, %ah
+	cmpl	$0xc190, %eax	// is num <= -18.0?
+	fstsw	%ax
+	movb	$0x45, %ch
+	jb	4f
+
+	// Below -18.0 (may be -NaN or -Inf).
+	andb	%ah, %ch
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpb	$0x01, %ch
+	je	5f		// If -NaN, jump.
+	jmp	2f		// -large, possibly -Inf.
+
+4:	// In range -18.0 to 88.5 (may be +-0 but not NaN or +-Inf).
+	andb	%ah, %ch
+	cmpb	$0x40, %ch
+	je	3f		// If +-0, jump.
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+5:	fldt	MO(l2e)		// log2(e) : x
+	fmulp			// log2(e)*x
+	fld	%st		// log2(e)*x : log2(e)*x
+	// Set round-to-nearest temporarily.
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fstcw	4(%esp)
+	movl	$0xf3ff, %ecx
+	andl	4(%esp), %ecx
+	movl	%ecx, (%esp)
+	fldcw	(%esp)
+	frndint			// int(log2(e)*x) : log2(e)*x
+	fldcw	4(%esp)
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fsubr	%st, %st(1)	// int(log2(e)*x) : fract(log2(e)*x)
+	fxch			// fract(log2(e)*x) : int(log2(e)*x)
+	f2xm1			// 2^fract(log2(e)*x)-1 : int(log2(e)*x)
+	fscale			// 2^(log2(e)*x)-2^int(log2(e)*x) : int(log2(e)*x)
+	fxch			// int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fldl	MO(one)		// 1 : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fscale			// 2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrl	MO(one)		// 1-2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fstp	%st(1)		// 1-2^int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrp	%st, %st(1)	// 2^(log2(e)*x)-1 = e^x-1
+	FLT_CHECK_FORCE_UFLOW
+	ret
+
+2:	fstp	%st
+	fldl	MO(minus1)	// Set result to -1.0.
+3:	ret
+END(__expm1f)
+weak_alias (__expm1f, expm1f)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S
new file mode 100644
index 0000000000..7fbd99b0db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S
@@ -0,0 +1,2 @@
+#define USE_AS_EXPM1L
+#include <e_expl.S>
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabs.S b/REORG.TODO/sysdeps/i386/fpu/s_fabs.S
new file mode 100644
index 0000000000..23ae9dccb9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabs.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+	.text
+ENTRY(__fabs)
+	fldl	4(%esp)
+	fabs
+	ret
+END(__fabs)
+weak_alias (__fabs, fabs)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S b/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S
new file mode 100644
index 0000000000..c0407a8839
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+	.text
+ENTRY(__fabsf)
+	flds	4(%esp)
+	fabs
+	ret
+END(__fabsf)
+weak_alias (__fabsf, fabsf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S b/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S
new file mode 100644
index 0000000000..a12a3e050b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+	.text
+ENTRY(__fabsl)
+	fldt	4(%esp)
+	fabs
+	ret
+END(__fabsl)
+weak_alias (__fabsl, fabsl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fdim.c b/REORG.TODO/sysdeps/i386/fpu/s_fdim.c
new file mode 100644
index 0000000000..6243c62998
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fdim.c
@@ -0,0 +1,50 @@
+/* Return positive difference between arguments.  i386 version.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <fpu_control.h>
+#include <math.h>
+#include <math_private.h>
+
+double
+__fdim (double x, double y)
+{
+  if (islessequal (x, y))
+    return 0.0;
+
+  /* To avoid double rounding, set double precision for the
+     subtraction.  math_narrow_eval is still needed to eliminate
+     excess range in the case of overflow.  If the result of the
+     subtraction is in the subnormal range for double, it is exact, so
+     no issues of double rounding for subnormals arise.  */
+  fpu_control_t cw, cw_double;
+  _FPU_GETCW (cw);
+  cw_double = (cw & ~_FPU_EXTENDED) | _FPU_DOUBLE;
+  _FPU_SETCW (cw_double);
+  double r = math_narrow_eval (x - y);
+  _FPU_SETCW (cw);
+  if (isinf (r) && !isinf (x) && !isinf (y))
+    __set_errno (ERANGE);
+
+  return r;
+}
+weak_alias (__fdim, fdim)
+#ifdef NO_LONG_DOUBLE
+strong_alias (__fdim, __fdiml)
+weak_alias (__fdim, fdiml)
+#endif
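A short usage sketch (not part of this patch) of the contract the code
above implements: positive difference, zero for x <= y, and ERANGE
when the subtraction overflows:

  #include <errno.h>
  #include <float.h>
  #include <math.h>
  #include <stdio.h>

  int main (void)
  {
    printf ("%g\n", fdim (5.0, 3.0));        /* 2 */
    printf ("%g\n", fdim (3.0, 5.0));        /* 0 */
    errno = 0;
    double r = fdim (DBL_MAX, -DBL_MAX);     /* overflows to +inf */
    printf ("%g %d\n", r, errno == ERANGE);  /* inf 1 */
    return 0;
  }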
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finite.S b/REORG.TODO/sysdeps/i386/fpu/s_finite.S
new file mode 100644
index 0000000000..1ae4aed451
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finite.S
@@ -0,0 +1,17 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finite)
+	movl	8(%esp),%eax
+	movl    $0xFFEFFFFF,%ecx
+	subl    %eax,%ecx
+	xorl    %ecx,%eax
+	shrl	$31, %eax
+	ret
+END (__finite)
+weak_alias (__finite, finite)
+hidden_def (__finite)
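The branch-free test works on the high word hw of the double: after
0xFFEFFFFF - hw, the sign bit of the result differs from the sign bit
of hw exactly when the exponent field is below 0x7FF, i.e. when x is
finite, and the xor/shift extract that difference as 0 or 1.  The same
trick as a hedged C sketch (not part of this patch):

  #include <stdint.h>
  #include <string.h>

  static int finite_sketch (double x)
  {
    uint64_t u;
    memcpy (&u, &x, sizeof u);           /* bit pattern of x */
    uint32_t hw = (uint32_t) (u >> 32);  /* sign, exponent, mantissa top */
    return (int) (((0xFFEFFFFFu - hw) ^ hw) >> 31);
  }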
+
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finitef.S b/REORG.TODO/sysdeps/i386/fpu/s_finitef.S
new file mode 100644
index 0000000000..69e72facff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finitef.S
@@ -0,0 +1,16 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finitef)
+	movl	4(%esp),%eax
+	movl    $0xFF7FFFFF,%ecx
+	subl    %eax,%ecx
+	xorl    %ecx,%eax
+	shrl    $31,%eax
+	ret
+END (__finitef)
+weak_alias (__finitef, finitef)
+hidden_def (__finitef)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finitel.S b/REORG.TODO/sysdeps/i386/fpu/s_finitel.S
new file mode 100644
index 0000000000..cce90e18fc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finitel.S
@@ -0,0 +1,15 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finitel)
+	movl	12(%esp),%eax
+	orl	$0xffff8000, %eax
+	incl	%eax
+	shrl	$31, %eax
+	ret
+END (__finitel)
+weak_alias (__finitel, finitel)
+hidden_def (__finitel)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floor.S b/REORG.TODO/sysdeps/i386/fpu/s_floor.S
new file mode 100644
index 0000000000..ed837dae40
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floor.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_floor.S,v 1.4 1995/05/09 00:01:59 jtc Exp $")
+
+ENTRY(__floor)
+	fldl	4(%esp)
+	subl	$32,%esp
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use the 32-bit %edx here although only the low 16 bits
+	   are defined.  But none of the operations should care, and
+	   they are faster than the 16-bit operations.  */
+	movl	$0x400,%edx		/* round towards -oo */
+	orl	4(%esp),%edx
+	andl	$0xf7ff,%edx
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__floor)
+weak_alias (__floor, floor)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floorf.S b/REORG.TODO/sysdeps/i386/fpu/s_floorf.S
new file mode 100644
index 0000000000..84b6f7ed99
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floorf.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_floorf.S,v 1.3 1995/05/09 00:04:32 jtc Exp $")
+
+ENTRY(__floorf)
+	flds	4(%esp)
+	subl	$32,%esp
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use the 32-bit %edx here although only the low 16 bits
+	   are defined.  But none of the operations should care, and
+	   they are faster than the 16-bit operations.  */
+	movl	$0x400,%edx		/* round towards -oo */
+	orl	4(%esp),%edx
+	andl	$0xf7ff,%edx
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__floorf)
+weak_alias (__floorf, floorf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floorl.S b/REORG.TODO/sysdeps/i386/fpu/s_floorl.S
new file mode 100644
index 0000000000..dc74a0c446
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floorl.S
@@ -0,0 +1,40 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__floorl)
+	fldt	4(%esp)
+	subl	$32,%esp
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use the 32-bit %edx here although only the low 16 bits
+	   are defined.  But none of the operations should care, and
+	   they are faster than the 16-bit operations.  */
+	movl	$0x400,%edx		/* round towards -oo */
+	orl	4(%esp),%edx
+	andl	$0xf7ff,%edx
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	/* Preserve "invalid" exceptions from sNaN input.  */
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, 8(%esp)
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__floorl)
+weak_alias (__floorl, floorl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/fpu/s_fmax.S
new file mode 100644
index 0000000000..218dcef421
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmax.S
@@ -0,0 +1,43 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmax)
+	fldl	12(%esp)	// y
+	fxam
+	fnstsw
+	fldl	4(%esp)		// y : x
+
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	1f		// y == NaN
+
+	fucom	%st(1)
+	fnstsw
+	sahf
+	jnc	1f
+
+	fxch	%st(1)
+1:	fstp	%st(1)
+
+	ret
+END(__fmax)
+weak_alias (__fmax, fmax)
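The branches above encode the C99 rule that a NaN argument counts as a
missing argument.  A hedged C sketch (not part of this patch):

  #include <math.h>

  static double fmax_sketch (double x, double y)
  {
    if (isnan (y))
      return x;              /* the je 1f path above */
    if (isnan (x))
      return y;              /* falls out of the unordered fucom */
    return x >= y ? x : y;
  }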
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S
new file mode 100644
index 0000000000..b7a00cefeb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S
@@ -0,0 +1,43 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmaxf)
+	flds	8(%esp)		// y
+	fxam
+	fnstsw
+	flds	4(%esp)		// y : x
+
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	1f		// y == NaN
+
+	fucom	%st(1)
+	fnstsw
+	sahf
+	jnc	1f
+
+	fxch	%st(1)
+1:	fstp	%st(1)
+
+	ret
+END(__fmaxf)
+weak_alias (__fmaxf, fmaxf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S
new file mode 100644
index 0000000000..68162921db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S
@@ -0,0 +1,71 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmaxl)
+	fldt	16(%esp)	// y
+	fxam
+	fnstsw
+	fldt	4(%esp)		// y : x
+
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	2f		// y == NaN
+
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	3f		// x == NaN
+
+	fucom	%st(1)
+	fnstsw
+	sahf
+	jnc	1f
+
+	fxch	%st(1)
+1:	fstp	%st(1)
+
+	ret
+
+2:	// st(1) is a NaN; st(0) may or may not be.
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	4f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 23(%esp)
+	jz	4f
+	fstp	%st(1)
+	ret
+
+3:	// st(0) is a NaN; st(1) is not.  Test if st(0) is signaling.
+	testb	$0x40, 11(%esp)
+	jz	4f
+	fstp	%st(0)
+	ret
+
+4:	// Both arguments are NaNs, or one is a signaling NaN.
+	faddp
+	ret
+END(__fmaxl)
+weak_alias (__fmaxl, fmaxl)
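The long double variant refines the rule: a quiet NaN still counts as
missing, but a signaling NaN (quiet bit 0x40 of the top mantissa byte
clear) must yield a NaN and raise the invalid exception, which the
faddp at label 4 provides.  A hedged C sketch using glibc's
issignaling macro (the asm tests the bit directly; not part of this
patch):

  #define _GNU_SOURCE
  #include <math.h>

  static long double fmaxl_sketch (long double x, long double y)
  {
    if (isnan (x) || isnan (y))
      {
        if ((isnan (x) && isnan (y))
            || issignaling (x) || issignaling (y))
          return x + y;            /* label 4: NaN result; raises
                                      invalid for an sNaN input */
        return isnan (y) ? x : y;  /* quiet NaN is "missing" */
      }
    return x >= y ? x : y;
  }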
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/fpu/s_fmin.S
new file mode 100644
index 0000000000..a5bb0e06dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmin.S
@@ -0,0 +1,43 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmin)
+	fldl	4(%esp)		// x
+	fldl	12(%esp)	// x : y
+
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	1f		// y == NaN
+
+	fucom	%st(1)
+	fnstsw
+	sahf
+	jc	2f
+
+1:	fxch	%st(1)
+2:	fstp	%st(1)
+
+	ret
+END(__fmin)
+weak_alias (__fmin, fmin)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/fpu/s_fminf.S
new file mode 100644
index 0000000000..fba4a41120
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fminf.S
@@ -0,0 +1,43 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fminf)
+	flds	4(%esp)		// x
+	flds	8(%esp)		// x : y
+
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	1f		// y == NaN
+
+	fucom	%st(1)
+	fnstsw
+	sahf
+	jc	2f
+
+1:	fxch	%st(1)
+2:	fstp	%st(1)
+
+	ret
+END(__fminf)
+weak_alias (__fminf, fminf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/fpu/s_fminl.S
new file mode 100644
index 0000000000..12ef21fda9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fminl.S
@@ -0,0 +1,71 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fminl)
+	fldt	16(%esp)	// y
+	fxam
+	fnstsw
+	fldt	4(%esp)		// y : x
+
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	2f		// y == NaN
+
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	3f		// x == NaN
+
+	fucom	%st(1)
+	fnstsw
+	sahf
+	jc	1f
+
+	fxch	%st(1)
+1:	fstp	%st(1)
+
+	ret
+
+2:	// st(1) is a NaN; st(0) may or may not be.
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	4f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 23(%esp)
+	jz	4f
+	fstp	%st(1)
+	ret
+
+3:	// st(0) is a NaN; st(1) is not.  Test if st(0) is signaling.
+	testb	$0x40, 11(%esp)
+	jz	4f
+	fstp	%st(0)
+	ret
+
+4:	// Both arguments are NaNs, or one is a signaling NaN.
+	faddp
+	ret
+END(__fminl)
+weak_alias (__fminl, fminl)
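
Unlike the double and float versions, the long double variant above
also distinguishes quiet from signaling NaNs: bit 62 of the 80-bit
operand is the quiet bit, and the testb $0x40 instructions probe it in
the top mantissa byte (offset 7 within each argument, hence 11(%esp)
and 23(%esp)).  A rough C illustration of that probe, assuming the
i386 little-endian storage layout:

    #include <string.h>

    /* Sketch: test the quiet bit (bit 62) of an x87 long double, as
       the testb $0x40 instructions in __fminl do.  Assumes the i386
       layout with the mantissa in bytes 0-7.  */
    static int
    ldbl_nan_is_quiet (long double x)
    {
      unsigned char b[sizeof (long double)];
      memcpy (b, &x, sizeof b);
      return (b[7] & 0x40) != 0;   /* byte 7 holds mantissa bits 56-63 */
    }
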
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c b/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c
new file mode 100644
index 0000000000..ce19fd0035
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c
@@ -0,0 +1,42 @@
+/* Return classification value corresponding to argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+
+#include <math_private.h>
+
+
+int
+__fpclassifyl (long double x)
+{
+  u_int32_t ex, hx, lx;
+  int retval = FP_NORMAL;
+
+  GET_LDOUBLE_WORDS (ex, hx, lx, x);
+  ex &= 0x7fff;
+  if ((ex | lx | hx) == 0)
+    retval = FP_ZERO;
+  else if (ex == 0 && (hx & 0x80000000) == 0)
+    retval = FP_SUBNORMAL;
+  else if (ex == 0x7fff)
+    retval = ((hx & 0x7fffffff) | lx) != 0 ? FP_NAN : FP_INFINITE;
+
+  return retval;
+}
+libm_hidden_def (__fpclassifyl)
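
The three words fetched by GET_LDOUBLE_WORDS are the sign/exponent
half-word (ex), the high mantissa word including the explicit integer
bit 0x80000000 (hx) and the low mantissa word (lx); the chain of tests
then reads off the usual IEEE cases.  A usage sketch, reaching the
classifier through the standard fpclassify macro:

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      printf ("%d\n", fpclassify (0.0L) == FP_ZERO);
      printf ("%d\n", fpclassify (1.0L) == FP_NORMAL);
      printf ("%d\n", fpclassify (0x1p-16400L) == FP_SUBNORMAL);
      printf ("%d\n", fpclassify (__builtin_infl ()) == FP_INFINITE);
      printf ("%d\n", fpclassify (__builtin_nanl ("")) == FP_NAN);
      return 0;                 /* each line prints 1 */
    }
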
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexp.S b/REORG.TODO/sysdeps/i386/fpu/s_frexp.S
new file mode 100644
index 0000000000..104f733bf6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexp.S
@@ -0,0 +1,83 @@
+/* ix87 specific frexp implementation for double.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type two54,@object
+two54:	.byte 0, 0, 0, 0, 0, 0, 0x50, 0x43
+	ASM_SIZE_DIRECTIVE(two54)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS	4		/* no space for saved regs */
+#define VAL0	PARMS
+#define VAL1	VAL0+4
+#define EXPP	VAL1+4
+
+	.text
+ENTRY (__frexp)
+
+	movl	VAL0(%esp), %ecx
+	movl	VAL1(%esp), %eax
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+	orl	%eax, %ecx
+	jz	1f
+	xorl	%ecx, %ecx
+	cmpl	$0x7ff00000, %eax
+	jae	1f
+
+	cmpl	$0x00100000, %eax
+	jae	2f
+
+	fldl	VAL0(%esp)
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fmull	MO(two54)
+	movl	$-54, %ecx
+	fstpl	VAL0(%esp)
+	fwait
+	movl	VAL1(%esp), %eax
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+
+2:	shrl	$20, %eax
+	andl	$0x800fffff, %edx
+	subl	$1022, %eax
+	orl	$0x3fe00000, %edx
+	addl	%eax, %ecx
+	movl	%edx, VAL1(%esp)
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the fraction from the stack and return.  */
+1:	movl	EXPP(%esp), %eax
+	fldl	VAL0(%esp)
+	movl	%ecx, (%eax)
+
+	ret
+END (__frexp)
+weak_alias (__frexp, frexp)
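
The flow above: zeros, infinities and NaNs pass through with an
exponent of 0; subnormals are first multiplied by the two54 constant
(2^54 as a double) so their exponent field becomes non-zero, with %ecx
pre-biased to -54; finally the stored exponent is replaced by that of
0.5, putting the returned fraction in [0.5, 1).  A C rendering of the
same bit manipulation, for illustration only:

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the algorithm in __frexp above.  */
    static double
    frexp_sketch (double x, int *eptr)
    {
      uint64_t u;
      uint32_t hi;
      int e = 0;

      memcpy (&u, &x, sizeof u);
      hi = u >> 32;
      *eptr = 0;
      if ((u << 1) == 0 || (hi & 0x7fffffff) >= 0x7ff00000)
        return x;                       /* 0, Inf or NaN: unchanged */

      if ((hi & 0x7fffffff) < 0x00100000)
        {                               /* subnormal: scale by 2^54 */
          x *= 0x1p54;
          memcpy (&u, &x, sizeof u);
          hi = u >> 32;
          e = -54;
        }

      e += (int) ((hi >> 20) & 0x7ff) - 1022;
      hi = (hi & 0x800fffff) | 0x3fe00000;  /* force [0.5, 1) */
      u = ((uint64_t) hi << 32) | (uint32_t) u;
      memcpy (&x, &u, sizeof x);
      *eptr = e;
      return x;
    }
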
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S b/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S
new file mode 100644
index 0000000000..f21c39ec4b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S
@@ -0,0 +1,80 @@
+/* ix87 specific frexp implementation for float.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type two25,@object
+two25:	.byte 0, 0, 0, 0x4c
+	ASM_SIZE_DIRECTIVE(two25)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS	4		/* no space for saved regs */
+#define VAL	PARMS
+#define EXPP	VAL+4
+
+	.text
+ENTRY (__frexpf)
+
+	movl	VAL(%esp), %eax
+	xorl	%ecx, %ecx
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+	jz	1f
+	cmpl	$0x7f800000, %eax
+	jae	1f
+
+	cmpl	$0x00800000, %eax
+	jae	2f
+
+	flds	VAL(%esp)
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fmuls	MO(two25)
+	movl	$-25, %ecx
+	fstps	VAL(%esp)
+	fwait
+	movl	VAL(%esp), %eax
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+
+2:	shrl	$23, %eax
+	andl	$0x807fffff, %edx
+	subl	$126, %eax
+	orl	$0x3f000000, %edx
+	addl	%eax, %ecx
+	movl	%edx, VAL(%esp)
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the fraction from the stack and return.  */
+1:	movl	EXPP(%esp), %eax
+	flds	VAL(%esp)
+	movl	%ecx, (%eax)
+
+	ret
+END (__frexpf)
+weak_alias (__frexpf, frexpf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S b/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S
new file mode 100644
index 0000000000..04f28888d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S
@@ -0,0 +1,92 @@
+/* ix87 specific frexp implementation for long double.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type two64,@object
+two64:	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
+	ASM_SIZE_DIRECTIVE(two64)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS	4		/* no space for saved regs */
+#define VAL0	PARMS
+#define VAL1	VAL0+4
+#define VAL2	VAL1+4
+#define EXPP	VAL2+4
+
+	.text
+ENTRY (__frexpl)
+
+	movl	VAL0(%esp), %ecx
+	movl	VAL2(%esp), %eax
+	orl	VAL1(%esp), %ecx
+	movl	%eax, %edx
+	andl	$0x7fff, %eax
+	orl	%eax, %ecx
+	jz	1f
+	xorl	%ecx, %ecx
+	cmpl	$0x7fff, %eax
+	je	3f
+
+	cmpl	$0, %eax
+	jne	2f
+
+	fldt	VAL0(%esp)
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	fmull	MO(two64)	/* It's not necessary to use an 80-bit factor */
+	movl	$-64, %ecx
+	fstpt	VAL0(%esp)
+	fwait
+	movl	VAL2(%esp), %eax
+	movl	%eax, %edx
+	andl	$0x7fff, %eax
+
+2:	andl	$0x8000, %edx
+	subl	$16382, %eax
+	orl	$0x3ffe, %edx
+	addl	%eax, %ecx
+	movl	%edx, VAL2(%esp)
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the fraction from the stack and return.  */
+1:	movl	EXPP(%esp), %eax
+	fldt	VAL0(%esp)
+	movl	%ecx, (%eax)
+
+	ret
+
+	/* Infinity or NaN; ensure signaling NaNs are quieted.  */
+3:	movl	EXPP(%esp), %eax
+	fldt	VAL0(%esp)
+	fadd	%st
+	movl	%ecx, (%eax)
+	ret
+END (__frexpl)
+weak_alias (__frexpl, frexpl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c b/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c
new file mode 100644
index 0000000000..cdd77183fa
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c
@@ -0,0 +1,32 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Change for long double by Ulrich Drepper <drepper@cygnus.com>.
+ * Intel i387 specific version.
+ * Public domain.
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/*
+ * isinfl(x) returns 1 if x is inf, -1 if x is -inf, else 0;
+ * no branching!
+ */
+
+#include <math.h>
+#include <math_private.h>
+
+int __isinfl(long double x)
+{
+	int32_t se,hx,lx;
+	GET_LDOUBLE_WORDS(se,hx,lx,x);
+	/* The additional ^ 0x80000000 is necessary because in Intel's
+	   internal representation the normally implicit one is explicitly
+	   present.  */
+	lx |= (hx ^ 0x80000000) | ((se & 0x7fff) ^ 0x7fff);
+	lx |= -lx;
+	se &= 0x8000;
+	return ~(lx >> 31) & (1 - (se >> 14));
+}
+hidden_def (__isinfl)
+weak_alias (__isinfl, isinfl)
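
The branch-free trick: after the OR chain, lx is zero exactly when the
exponent field is all ones and the mantissa is the bare explicit
integer bit, i.e. exactly for ±Inf; lx |= -lx then pushes any non-zero
value into the sign bit, so ~(lx >> 31) is all ones only for
infinities, while 1 - (se >> 14) maps the sign bit to +1 or -1.  A
usage check, assuming the historical isinfl declaration is visible:

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      printf ("%d\n", isinfl (__builtin_infl ()));    /* 1 */
      printf ("%d\n", isinfl (-__builtin_infl ()));   /* -1 */
      printf ("%d\n", isinfl (__builtin_nanl ("")));  /* 0 */
      printf ("%d\n", isinfl (1.0L));                 /* 0 */
      return 0;
    }
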
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c b/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c
new file mode 100644
index 0000000000..816396d8fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c
@@ -0,0 +1,43 @@
+/* s_isnanl.c -- long double version for i387 of s_isnan.c.
+ * Conversion to long double by Ulrich Drepper,
+ * Cygnus Support, drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/*
+ * isnanl(x) returns 1 if x is nan, else 0;
+ * no branching!
+ */
+
+#include <math.h>
+#include <math_private.h>
+
+int __isnanl(long double x)
+{
+	int32_t se,hx,lx;
+	GET_LDOUBLE_WORDS(se,hx,lx,x);
+	se = (se & 0x7fff) << 1;
+	/* The additional & 0x7fffffff is required because Intel's
+	   extended format has the normally implicit 1 explicitly
+	   present.  Sigh!  */
+	lx |= hx & 0x7fffffff;
+	se |= (u_int32_t)(lx|(-lx))>>31;
+	se = 0xfffe - se;
+	return (int)((u_int32_t)(se))>>16;
+}
+hidden_def (__isnanl)
+weak_alias (__isnanl, isnanl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrint.S b/REORG.TODO/sysdeps/i386/fpu/s_llrint.S
new file mode 100644
index 0000000000..a597183aab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrint.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__llrint)
+	fldl	4(%esp)
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fistpll	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__llrint)
+weak_alias (__llrint, llrint)
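
fistpll stores a 64-bit integer using the rounding mode currently set
in the FPU control word, which is exactly the semantics llrint needs;
the two pops then place the result in %eax (low half) and %edx (high
half) as the i386 ABI requires for 64-bit return values.  A usage
sketch showing the rounding-mode dependence:

    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      fesetround (FE_TONEAREST);
      printf ("%lld\n", llrint (2.5));   /* 2: ties round to even */
      fesetround (FE_UPWARD);
      printf ("%lld\n", llrint (2.5));   /* 3 */
      fesetround (FE_TONEAREST);
      return 0;
    }
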
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S b/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S
new file mode 100644
index 0000000000..a4b574eccb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__llrintf)
+	flds	4(%esp)
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fistpll	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__llrintf)
+weak_alias (__llrintf, llrintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S b/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S
new file mode 100644
index 0000000000..7b48c02ef4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__llrintl)
+	fldt	4(%esp)
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fistpll	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__llrintl)
+weak_alias (__llrintl, llrintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1p.S b/REORG.TODO/sysdeps/i386/fpu/s_log1p.S
new file mode 100644
index 0000000000..7978e76095
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1p.S
@@ -0,0 +1,67 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $")
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	/* The fyl2xp1 instruction can only be used for values in
+		-1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2;
+	   0.29 is a safe value.
+	*/
+limit:	.double 0.29
+one:	.double 1.0
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 instruction when the argument is in the range -0.29 to
+ * 0.29; otherwise use fyl2x, which needs some extra computation.
+ */
+	.text
+ENTRY(__log1p)
+	fldln2
+
+	fldl	4(%esp)
+
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	fxam
+	fnstsw
+	fld	%st
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:	fabs
+	fcompl	MO(limit)
+	fnstsw
+	sahf
+	jc	2f
+
+	faddl	MO(one)
+	fyl2x
+	ret
+
+2:	fyl2xp1
+	DBL_CHECK_FORCE_UFLOW_NONNAN
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	ret
+
+END (__log1p)
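
The split at |x| = 0.29: fyl2xp1 computes st(1) * log2(st(0) + 1)
accurately only near zero, so the code uses it inside the documented
safe range and falls back to fyl2x on 1 + x outside it; the fldln2 at
entry supplies ln 2 as st(1), so both paths yield a natural logarithm.
A C-level sketch of this dispatch; fyl2xp1_approx is a stand-in for
the hardware instruction, not a real API:

    #include <math.h>

    /* Stand-in for fyl2xp1 (illustrative; a real replacement must be
       accurate for small x, which plain log2 of 1 + x is not).  */
    static double
    fyl2xp1_approx (double x)
    {
      return log2 (1.0 + x);
    }

    /* Sketch of the range split in __log1p above.  */
    static double
    log1p_sketch (double x)
    {
      if (isnan (x))
        return x + x;                       /* propagate the NaN */
      if (fabs (x) <= 0.29)
        return M_LN2 * fyl2xp1_approx (x);  /* accurate near zero */
      return M_LN2 * log2 (1.0 + x);        /* 1 + x is safe out here */
    }
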
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S b/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S
new file mode 100644
index 0000000000..acaa299d94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S
@@ -0,0 +1,67 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_log1pf.S,v 1.4 1995/05/09 00:13:05 jtc Exp $")
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	/* The fyl2xp1 instruction can only be used for values in
+		-1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2;
+	   0.29 is a safe value.
+	*/
+limit:	.float 0.29
+one:	.float 1.0
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 instruction when the argument is in the range -0.29 to
+ * 0.29; otherwise use fyl2x, which needs some extra computation.
+ */
+	.text
+ENTRY(__log1pf)
+	fldln2
+
+	flds	4(%esp)
+
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	fxam
+	fnstsw
+	fld	%st
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:	fabs
+	fcomps	MO(limit)
+	fnstsw
+	sahf
+	jc	2f
+
+	fadds	MO(one)
+	fyl2x
+	ret
+
+2:	fyl2xp1
+	FLT_CHECK_FORCE_UFLOW_NONNAN
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	ret
+
+END (__log1pf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S b/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S
new file mode 100644
index 0000000000..0fd05cbdb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S
@@ -0,0 +1,76 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $")
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	/* The fyl2xp1 instruction can only be used for values in
+		-1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2;
+	   0.29 is a safe value.
+	*/
+limit:	.tfloat 0.29
+	/* Please note: we use a double value here.  Since 1.0 has
+	   an exact representation this does not affect the accuracy
+	   but it helps to optimize the code.  */
+one:	.double 1.0
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 instruction when the argument is in the range -0.29 to
+ * 0.29; otherwise use fyl2x, which needs some extra computation.
+ */
+	.text
+ENTRY(__log1pl)
+	fldln2
+
+	fldt	4(%esp)
+
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	fxam
+	fnstsw
+	fld	%st
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:
+	fabs
+	fldt	MO(limit)
+	fcompp
+	fnstsw
+	sahf
+	jnc	2f
+
+	movzwl	4+8(%esp), %eax
+	xorb	$0x80, %ah
+	cmpl	$0xc040, %eax
+	jae	5f
+
+	faddl	MO(one)
+5:	fyl2x
+	ret
+
+2:	fyl2xp1
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)
+	fstp	%st(1)
+	fadd	%st(0)
+	ret
+
+END (__log1pl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logb.S b/REORG.TODO/sysdeps/i386/fpu/s_logb.S
new file mode 100644
index 0000000000..f78c091c8a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logb.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_logb.S,v 1.4 1995/05/09 00:14:30 jtc Exp $")
+
+ENTRY(__logb)
+	fldl	4(%esp)
+	fxtract
+	fstp	%st
+	ret
+END (__logb)
+weak_alias (__logb, logb)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logbf.S b/REORG.TODO/sysdeps/i386/fpu/s_logbf.S
new file mode 100644
index 0000000000..91eb3d2925
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logbf.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_logbf.S,v 1.3 1995/05/09 00:15:12 jtc Exp $")
+
+ENTRY(__logbf)
+	flds	4(%esp)
+	fxtract
+	fstp	%st
+	ret
+END (__logbf)
+weak_alias (__logbf, logbf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logbl.c b/REORG.TODO/sysdeps/i386/fpu/s_logbl.c
new file mode 100644
index 0000000000..391e2db489
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logbl.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__logbl (long double x)
+{
+  long double res;
+
+  asm ("fxtract\n"
+       "fstp	%%st" : "=t" (res) : "0" (x));
+  return res;
+}
+
+weak_alias (__logbl, logbl)
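
All three logb variants rely on fxtract, which replaces st(0) by its
unbiased binary exponent and pushes the significand on top; popping
the significand leaves the exponent as the result.  Usage sketch:

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      printf ("%g\n", logb (8.0));   /* 3:   8 = 1.0 * 2^3 */
      printf ("%g\n", logb (0.4));   /* -2:  0.4 = 1.6 * 2^-2 */
      return 0;
    }
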
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrint.S b/REORG.TODO/sysdeps/i386/fpu/s_lrint.S
new file mode 100644
index 0000000000..79a374b399
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrint.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__lrint)
+	fldl	4(%esp)
+	subl	$4, %esp
+	cfi_adjust_cfa_offset (4)
+	fistpl	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__lrint)
+weak_alias (__lrint, lrint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S b/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S
new file mode 100644
index 0000000000..fc6e68e073
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__lrintf)
+	flds	4(%esp)
+	subl	$4, %esp
+	cfi_adjust_cfa_offset (4)
+	fistpl	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__lrintf)
+weak_alias (__lrintf, lrintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S b/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S
new file mode 100644
index 0000000000..ba6dbdf44c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__lrintl)
+	fldt	4(%esp)
+	subl	$4, %esp
+	cfi_adjust_cfa_offset (4)
+	fistpl	(%esp)
+	fwait
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__lrintl)
+weak_alias (__lrintl, lrintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S
new file mode 100644
index 0000000000..f7b79b6ff2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyint)
+	fldl	4(%esp)
+	subl	$32, %esp
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)
+	frndint
+	fldenv	4(%esp)
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__nearbyint)
+weak_alias (__nearbyint, nearbyint)
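
nearbyint differs from rint only in that it must not raise the inexact
exception; the fnstenv/fldenv pair brackets frndint so that any
exception flags it sets are thrown away when the saved environment is
restored.  A sketch of the observable difference (best compiled
without optimization so the calls are not folded away):

    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      feclearexcept (FE_INEXACT);
      (void) nearbyint (2.5);
      printf ("%d\n", fetestexcept (FE_INEXACT) == 0);  /* 1: untouched */
      (void) rint (2.5);
      printf ("%d\n", fetestexcept (FE_INEXACT) != 0);  /* 1: rint raises it */
      return 0;
    }
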
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S
new file mode 100644
index 0000000000..92df2f87b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyintf)
+	flds	4(%esp)
+	subl	$32, %esp
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)
+	frndint
+	fldenv	4(%esp)
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__nearbyintf)
+weak_alias (__nearbyintf, nearbyintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S
new file mode 100644
index 0000000000..3b7d1e2436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyintl)
+	fldt	4(%esp)
+	subl	$32, %esp
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)
+	frndint
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, 8(%esp)
+	fldenv	4(%esp)
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__nearbyintl)
+weak_alias (__nearbyintl, nearbyintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c b/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c
new file mode 100644
index 0000000000..600ad7a8d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c
@@ -0,0 +1,125 @@
+/* s_nextafterl.c -- long double version of s_nextafter.c.
+ * Special version for i387.
+ * Conversion to long double by Ulrich Drepper,
+ * Cygnus Support, drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/* IEEE functions
+ *	nextafterl(x,y)
 *	return the next machine floating-point number after x in the
 *	direction toward y.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+
+long double __nextafterl(long double x, long double y)
+{
+	u_int32_t hx,hy,ix,iy;
+	u_int32_t lx,ly;
+	int32_t esx,esy;
+
+	GET_LDOUBLE_WORDS(esx,hx,lx,x);
+	GET_LDOUBLE_WORDS(esy,hy,ly,y);
+	ix = esx&0x7fff;		/* |x| */
+	iy = esy&0x7fff;		/* |y| */
+
+	/* Intel's extended format has the normally implicit 1 explicitly
+	   present.  Sigh!  */
+	if(((ix==0x7fff)&&(((hx&0x7fffffff)|lx)!=0)) ||   /* x is nan */
+	   ((iy==0x7fff)&&(((hy&0x7fffffff)|ly)!=0)))     /* y is nan */
+	   return x+y;
+	if(x==y) return y;		/* x=y, return y */
+	if((ix|hx|lx)==0) {			/* x == 0 */
+	    long double u;
+	    SET_LDOUBLE_WORDS(x,esy&0x8000,0,1);/* return +-minsubnormal */
+	    u = math_opt_barrier (x);
+	    u = u * u;
+	    math_force_eval (u);		/* raise underflow flag */
+	    return x;
+	}
+	if(esx>=0) {			/* x > 0 */
+	    if(esx>esy||((esx==esy) && (hx>hy||((hx==hy)&&(lx>ly))))) {
+	      /* x > y, x -= ulp */
+		if(lx==0) {
+		    if (hx <= 0x80000000) {
+		      if (esx == 0) {
+			--hx;
+		      } else {
+			esx -= 1;
+			hx = hx - 1;
+			if (esx > 0)
+			  hx |= 0x80000000;
+		      }
+		    } else
+		      hx -= 1;
+		}
+		lx -= 1;
+	    } else {				/* x < y, x += ulp */
+		lx += 1;
+		if(lx==0) {
+		    hx += 1;
+		    if (hx==0 || (esx == 0 && hx == 0x80000000)) {
+			esx += 1;
+			hx |= 0x80000000;
+		    }
+		}
+	    }
+	} else {				/* x < 0 */
+	    if(esy>=0||(esx>esy||((esx==esy)&&(hx>hy||((hx==hy)&&(lx>ly)))))){
+	      /* x < y, x -= ulp */
+		if(lx==0) {
+		    if (hx <= 0x80000000 && esx != 0xffff8000) {
+			esx -= 1;
+			hx = hx - 1;
+			if ((esx&0x7fff) > 0)
+			  hx |= 0x80000000;
+		    } else
+		      hx -= 1;
+		}
+		lx -= 1;
+	    } else {				/* x > y, x += ulp */
+		lx += 1;
+		if(lx==0) {
+		    hx += 1;
+		    if (hx==0 || (esx == 0xffff8000 && hx == 0x80000000)) {
+			esx += 1;
+			hx |= 0x80000000;
+		    }
+		}
+	    }
+	}
+	esy = esx&0x7fff;
+	if(esy==0x7fff) {
+	    long double u = x + x;	/* overflow  */
+	    math_force_eval (u);
+	    __set_errno (ERANGE);
+	}
+	if(esy==0) {
+	    long double u = x*x;		/* underflow */
+	    math_force_eval (u);		/* raise underflow flag */
+	    __set_errno (ERANGE);
+	}
+	SET_LDOUBLE_WORDS(x,esx,hx,lx);
+	return x;
+}
+weak_alias (__nextafterl, nextafterl)
+strong_alias (__nextafterl, __nexttowardl)
+weak_alias (__nextafterl, nexttowardl)
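
The stepping logic above works directly on the three words of the
80-bit representation, carrying from lx into hx and from hx into the
exponent while keeping the explicit integer bit 0x80000000 correct;
the deliberately evaluated x+x and x*x expressions raise the overflow
and underflow exceptions and set ERANGE at the range boundaries.
Usage sketch; each call moves by exactly one representable value:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      long double up = nextafterl (1.0L, 2.0L);
      printf ("%d\n", up - 1.0L == LDBL_EPSILON);  /* 1: one ulp at 1.0 */
      /* LDBL_TRUE_MIN is C11: the smallest subnormal.  */
      printf ("%d\n", nextafterl (0.0L, 1.0L) == LDBL_TRUE_MIN);  /* 1 */
      return 0;
    }
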
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c b/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c
new file mode 100644
index 0000000000..0b47044760
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c
@@ -0,0 +1,93 @@
+/* s_nexttoward.c
+ * Special i387 version
+ * Conversion from s_nextafter.c by Ulrich Drepper, Cygnus Support,
+ * drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/* IEEE functions
+ *	nexttoward(x,y)
 *	return the next machine floating-point number after x in the
 *	direction toward y.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include <float.h>
+
+double __nexttoward(double x, long double y)
+{
+	int32_t hx,ix,iy;
+	u_int32_t lx,hy,ly,esy;
+
+	EXTRACT_WORDS(hx,lx,x);
+	GET_LDOUBLE_WORDS(esy,hy,ly,y);
+	ix = hx&0x7fffffff;		/* |x| */
+	iy = esy&0x7fff;		/* |y| */
+
+	/* Intel's extended format has the normally implicit 1 explicitly
+	   present.  Sigh!  */
+	if(((ix>=0x7ff00000)&&((ix-0x7ff00000)|lx)!=0) ||   /* x is nan */
+	   ((iy>=0x7fff)&&((hy&0x7fffffff)|ly)!=0))        /* y is nan */
+	   return x+y;
+	if((long double) x==y) return y;	/* x=y, return y */
+	if((ix|lx)==0) {			/* x == 0 */
+	    double u;
+	    INSERT_WORDS(x,(esy&0x8000)<<16,1); /* return +-minsub */
+	    u = math_opt_barrier (x);
+	    u = u * u;
+	    math_force_eval (u);		/* raise underflow flag */
+	    return x;
+	}
+	if(hx>=0) {				/* x > 0 */
+	    if (x > y) {			/* x -= ulp */
+		if(lx==0) hx -= 1;
+		lx -= 1;
+	    } else {				/* x < y, x += ulp */
+		lx += 1;
+		if(lx==0) hx += 1;
+	    }
+	} else {				/* x < 0 */
+	    if (x < y) {			/* x -= ulp */
+		if(lx==0) hx -= 1;
+		lx -= 1;
+	    } else {				/* x > y, x += ulp */
+		lx += 1;
+		if(lx==0) hx += 1;
+	    }
+	}
+	hy = hx&0x7ff00000;
+	if(hy>=0x7ff00000) {
+	  double u = x+x;			/* overflow  */
+	  math_force_eval (u);
+	  __set_errno (ERANGE);
+	}
+	if(hy<0x00100000) {
+	    double u = x*x;			/* underflow */
+	    math_force_eval (u);		/* raise underflow flag */
+	    __set_errno (ERANGE);
+	}
+	INSERT_WORDS(x,hx,lx);
+	return x;
+}
+weak_alias (__nexttoward, nexttoward)
+#ifdef NO_LONG_DOUBLE
+strong_alias (__nexttoward, __nexttowardl)
+weak_alias (__nexttoward, nexttowardl)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c b/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c
new file mode 100644
index 0000000000..e1156d1e4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c
@@ -0,0 +1,77 @@
+/* s_nexttowardf.c -- float version of s_nextafter.c.
+ * Special i387 version.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include <float.h>
+
+float __nexttowardf(float x, long double y)
+{
+	int32_t hx,ix,iy;
+	u_int32_t hy,ly,esy;
+
+	GET_FLOAT_WORD(hx,x);
+	GET_LDOUBLE_WORDS(esy,hy,ly,y);
+	ix = hx&0x7fffffff;		/* |x| */
+	iy = esy&0x7fff;		/* |y| */
+
+	/* Intel's extended format has the normally implicit 1 explicitly
+	   present.  Sigh!  */
+	if((ix>0x7f800000) ||			/* x is nan */
+	   (iy>=0x7fff&&(((hy&0x7fffffff)|ly)!=0))) /* y is nan */
+	   return x+y;
+	if((long double) x==y) return y;	/* x=y, return y */
+	if(ix==0) {				/* x == 0 */
+	    float u;
+	    SET_FLOAT_WORD(x,((esy&0x8000)<<16)|1);/* return +-minsub*/
+	    u = math_opt_barrier (x);
+	    u = u * u;
+	    math_force_eval (u);		/* raise underflow flag */
+	    return x;
+	}
+	if(hx>=0) {				/* x > 0 */
+	    if(x > y) {				/* x -= ulp */
+		hx -= 1;
+	    } else {				/* x < y, x += ulp */
+		hx += 1;
+	    }
+	} else {				/* x < 0 */
+	    if(x < y) {				/* x -= ulp */
+		hx -= 1;
+	    } else {				/* x > y, x += ulp */
+		hx += 1;
+	    }
+	}
+	hy = hx&0x7f800000;
+	if(hy>=0x7f800000) {
+	  float u = x+x;			/* overflow  */
+	  math_force_eval (u);
+	  __set_errno (ERANGE);
+	}
+	if(hy<0x00800000) {
+	    float u = x*x;			/* underflow */
+	    math_force_eval (u);		/* raise underflow flag */
+	    __set_errno (ERANGE);
+	}
+	SET_FLOAT_WORD(x,hx);
+	return x;
+}
+weak_alias (__nexttowardf, nexttowardf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquo.S b/REORG.TODO/sysdeps/i386/fpu/s_remquo.S
new file mode 100644
index 0000000000..341285db30
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquo.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define DVDND	PARMS
+#define DVSOR	DVDND+8
+#define QUOP	DVSOR+8
+
+	.text
+ENTRY (__remquo)
+
+	fldl	DVSOR(%esp)
+	fldl	DVDND(%esp)
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b
+	fstp	%st(1)
+	/* Compute the low three bits of the quotient (congruent mod 8).  */
+	movl	%eax, %ecx
+	shrl	$8, %eax
+	shrl	$12, %ecx
+	andl	$4, %ecx
+	andl	$3, %eax
+	orl	%eax, %ecx
+	leal	(%ecx,%ecx,2),%ecx
+	movl	$0xef2a60, %eax
+	shrl	%cl, %eax
+	andl	$7, %eax
+	movl	QUOP(%esp), %ecx
+	movl	DVDND+4(%esp), %edx
+	xorl	DVSOR+4(%esp), %edx
+	testl	$0x80000000, %edx
+	jz	1f
+	negl	%eax
+1:	movl	%eax, (%ecx)
+
+	ret
+END (__remquo)
+weak_alias (__remquo, remquo)
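
After fprem1 the three low quotient bits are scattered across the
status word as C0 = Q2, C3 = Q1 and C1 = Q0; the shifts and masks
gather them into an index 0-7, and the constant 0xef2a60 acts as a
table of eight packed 3-bit entries that permutes the index back into
the true quotient bits, which are then negated if the operands' signs
differ.  A small C rendering of the table decode, for illustration:

    #include <stdio.h>

    /* Sketch of the quotient-bit decode in __remquo above.  */
    static int
    decode_quotient_bits (int c0, int c1, int c3)
    {
      int index = c0 | (c1 << 1) | (c3 << 2);
      return (0xef2a60 >> (3 * index)) & 7;
    }

    int
    main (void)
    {
      /* Quotient bits Q2=1, Q1=0, Q0=1 arrive as C0=1, C3=0, C1=1.  */
      printf ("%d\n", decode_quotient_bits (1, 1, 0));  /* 5 */
      return 0;
    }
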
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquof.S b/REORG.TODO/sysdeps/i386/fpu/s_remquof.S
new file mode 100644
index 0000000000..62063f068f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquof.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define DVDND	PARMS
+#define DVSOR	DVDND+4
+#define QUOP	DVSOR+4
+
+	.text
+ENTRY (__remquof)
+
+	flds	DVSOR(%esp)
+	flds	DVDND(%esp)
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b
+	fstp	%st(1)
+	/* Compute the low three bits of the quotient (congruent mod 8).  */
+	movl	%eax, %ecx
+	shrl	$8, %eax
+	shrl	$12, %ecx
+	andl	$4, %ecx
+	andl	$3, %eax
+	orl	%eax, %ecx
+	leal	(%ecx,%ecx,2),%ecx
+	movl	$0xef2a60, %eax
+	shrl	%cl, %eax
+	andl	$7, %eax
+	movl	QUOP(%esp), %ecx
+	movl	DVDND(%esp), %edx
+	xorl	DVSOR(%esp), %edx
+	testl	$0x80000000, %edx
+	jz	1f
+	negl	%eax
+1:	movl	%eax, (%ecx)
+
+	ret
+END (__remquof)
+weak_alias (__remquof, remquof)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquol.S b/REORG.TODO/sysdeps/i386/fpu/s_remquol.S
new file mode 100644
index 0000000000..f3d84fc7c2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquol.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define DVDND	PARMS
+#define DVSOR	DVDND+12
+#define QUOP	DVSOR+12
+
+	.text
+ENTRY (__remquol)
+
+	fldt	DVSOR(%esp)
+	fldt	DVDND(%esp)
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b
+	fstp	%st(1)
+	/* Compute the low three bits of the quotient (congruent mod 8).  */
+	movl	%eax, %ecx
+	shrl	$8, %eax
+	shrl	$12, %ecx
+	andl	$4, %ecx
+	andl	$3, %eax
+	orl	%eax, %ecx
+	leal	(%ecx,%ecx,2),%ecx
+	movl	$0xef2a60, %eax
+	shrl	%cl, %eax
+	andl	$7, %eax
+	movl	QUOP(%esp), %ecx
+	movl	DVDND+8(%esp), %edx
+	xorl	DVSOR+8(%esp), %edx
+	testl	$0x8000, %edx
+	jz	1f
+	negl	%eax
+1:	movl	%eax, (%ecx)
+
+	ret
+END (__remquol)
+weak_alias (__remquol, remquol)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rint.S b/REORG.TODO/sysdeps/i386/fpu/s_rint.S
new file mode 100644
index 0000000000..be36c5f0ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rint.S
@@ -0,0 +1,15 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_rint.S,v 1.4 1995/05/09 00:16:08 jtc Exp $")
+
+ENTRY(__rint)
+	fldl	4(%esp)
+	frndint
+	ret
+END (__rint)
+weak_alias (__rint, rint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rintf.S b/REORG.TODO/sysdeps/i386/fpu/s_rintf.S
new file mode 100644
index 0000000000..2b358c1cf1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rintf.S
@@ -0,0 +1,15 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_rintf.S,v 1.3 1995/05/09 00:17:22 jtc Exp $")
+
+ENTRY(__rintf)
+	flds	4(%esp)
+	frndint
+	ret
+END (__rintf)
+weak_alias (__rintf, rintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rintl.c b/REORG.TODO/sysdeps/i386/fpu/s_rintl.c
new file mode 100644
index 0000000000..66af9cb675
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rintl.c
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__rintl (long double x)
+{
+  long double res;
+
+  asm ("frndint" : "=t" (res) : "0" (x));
+  return res;
+}
+
+weak_alias (__rintl, rintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c b/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c
new file mode 100644
index 0000000000..1009713fbc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbn.  So we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c b/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c
new file mode 100644
index 0000000000..5e558c3540
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbnf.  So we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c b/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c
new file mode 100644
index 0000000000..cda2ec11c8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbnl.  So we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S
new file mode 100644
index 0000000000..4e90903115
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_scalbn.S,v 1.4 1995/05/09 00:19:06 jtc Exp $")
+
+ENTRY(__scalbn)
+	fildl	12(%esp)
+	fldl	4(%esp)
+	fscale
+	fstp	%st(1)
+	DBL_NARROW_EVAL
+	ret
+END (__scalbn)
+strong_alias (__scalbn, __scalbln)
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbn, scalbln, GLIBC_2_1);
+#endif
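
fscale multiplies st(0) by 2 raised to the value in st(1) (truncated
to an integer), so the body is just: load n, load x, scale, drop n;
DBL_NARROW_EVAL then rounds the extended-precision result to double so
overflow and underflow behave as for a genuine double.  Usage sketch:

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      printf ("%g\n", scalbn (3.0, 4));   /* 48 = 3 * 2^4 */
      printf ("%g\n", scalbn (1.0, -3));  /* 0.125 */
      return 0;
    }
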
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S
new file mode 100644
index 0000000000..f8353c4c75
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_scalbnf.S,v 1.3 1995/05/09 00:19:59 jtc Exp $")
+
+ENTRY(__scalbnf)
+	fildl	8(%esp)
+	flds	4(%esp)
+	fscale
+	fstp	%st(1)
+	FLT_NARROW_EVAL
+	ret
+END (__scalbnf)
+strong_alias (__scalbnf, __scalblnf)
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbnf, scalblnf, GLIBC_2_1);
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S
new file mode 100644
index 0000000000..839b5ff353
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__scalbnl)
+	fildl	16(%esp)
+	fldt	4(%esp)
+	fscale
+	fstp	%st(1)
+	ret
+END (__scalbnl)
+strong_alias (__scalbnl, __scalblnl)
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbnl, scalblnl, GLIBC_2_1);
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significand.S b/REORG.TODO/sysdeps/i386/fpu/s_significand.S
new file mode 100644
index 0000000000..4859b7ed71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significand.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_significand.S,v 1.4 1995/05/09 00:21:47 jtc Exp $")
+
+ENTRY(__significand)
+	fldl	4(%esp)
+	fxtract
+	fstp	%st(1)
+	ret
+END (__significand)
+weak_alias (__significand, significand)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significandf.S b/REORG.TODO/sysdeps/i386/fpu/s_significandf.S
new file mode 100644
index 0000000000..3a2de97759
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significandf.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_significandf.S,v 1.3 1995/05/09 00:24:07 jtc Exp $")
+
+ENTRY(__significandf)
+	flds	4(%esp)
+	fxtract
+	fstp	%st(1)
+	ret
+END (__significandf)
+weak_alias (__significandf, significandf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significandl.c b/REORG.TODO/sysdeps/i386/fpu/s_significandl.c
new file mode 100644
index 0000000000..b8cb093502
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significandl.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__significandl (long double x)
+{
+  long double res;
+
+  asm ("fxtract\n"
+       "fstp	%%st(1)" : "=t" (res) : "0" (x));
+  return res;
+}
+
+weak_alias (__significandl, significandl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_trunc.S b/REORG.TODO/sysdeps/i386/fpu/s_trunc.S
new file mode 100644
index 0000000000..e9a850b877
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_trunc.S
@@ -0,0 +1,37 @@
+/* Truncate double value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__trunc)
+	fldl	4(%esp)
+	subl	$32, %esp
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)
+	movl	$0xc00, %edx
+	orl	4(%esp), %edx
+	movl	%edx, (%esp)
+	fldcw	(%esp)
+	frndint
+	fldenv	4(%esp)
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END(__trunc)
+weak_alias (__trunc, trunc)
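
frndint rounds according to the RC field of the control word, so the
code ORs 0xc00 (RC = round toward zero) into a copy of the saved
control word, installs it, truncates, and then restores the original
environment, which also discards any inexact flag frndint may have
raised.  Usage sketch showing independence from the current rounding
mode:

    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      fesetround (FE_UPWARD);
      printf ("%g %g\n", trunc (2.7), trunc (-2.7));  /* 2 -2 */
      fesetround (FE_TONEAREST);
      return 0;
    }
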
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_truncf.S b/REORG.TODO/sysdeps/i386/fpu/s_truncf.S
new file mode 100644
index 0000000000..a93f5b9a2e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_truncf.S
@@ -0,0 +1,37 @@
+/* Truncate float value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__truncf)
+	flds	4(%esp)
+	subl	$32, %esp
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)
+	movl	$0xc00, %edx
+	orl	4(%esp), %edx
+	movl	%edx, (%esp)
+	fldcw	(%esp)
+	frndint
+	fldenv	4(%esp)
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END(__truncf)
+weak_alias (__truncf, truncf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_truncl.S b/REORG.TODO/sysdeps/i386/fpu/s_truncl.S
new file mode 100644
index 0000000000..a884123612
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_truncl.S
@@ -0,0 +1,40 @@
+/* Truncate long double value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__truncl)
+	fldt	4(%esp)
+	subl	$32, %esp
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)
+	movl	$0xc00, %edx
+	orl	4(%esp), %edx
+	movl	%edx, (%esp)
+	fldcw	(%esp)
+	frndint
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, 8(%esp)
+	fldenv	4(%esp)
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END(__truncl)
+weak_alias (__truncl, truncl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/slowexp.c b/REORG.TODO/sysdeps/i386/fpu/slowexp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/slowexp.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/slowpow.c b/REORG.TODO/sysdeps/i386/fpu/slowpow.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/slowpow.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/t_exp.c b/REORG.TODO/sysdeps/i386/fpu/t_exp.c
new file mode 100644
index 0000000000..fd37963b05
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/t_exp.c
@@ -0,0 +1 @@
+/* Empty.  Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c b/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c
new file mode 100644
index 0000000000..ddd36d0964
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c
@@ -0,0 +1,8 @@
+/* The inline __ieee754_sqrt is not correctly rounding; it's OK for
+   most internal uses in glibc, but not for sqrt itself.  */
+#define __ieee754_sqrt __avoid_ieee754_sqrt
+#include <math.h>
+#include <math_private.h>
+#undef __ieee754_sqrt
+extern double __ieee754_sqrt (double);
+#include <math/w_sqrt_compat.c>
diff --git a/REORG.TODO/sysdeps/i386/gccframe.h b/REORG.TODO/sysdeps/i386/gccframe.h
new file mode 100644
index 0000000000..579da40ae9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/gccframe.h
@@ -0,0 +1,27 @@
+/* Definition of object in frame unwind info.  i386 version.
+   Copyright (C) 2001-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define DWARF_FRAME_REGISTERS 17
+
+#define CRT_GET_RFIB_DATA(BASE)		\
+  {					\
+    register void *__ebx __asm__("ebx");\
+    BASE = __ebx;			\
+  }
+
+#include <sysdeps/generic/gccframe.h>
diff --git a/REORG.TODO/sysdeps/i386/gmp-mparam.h b/REORG.TODO/sysdeps/i386/gmp-mparam.h
new file mode 100644
index 0000000000..7ea503a403
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/gmp-mparam.h
@@ -0,0 +1,28 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#define IEEE_DOUBLE_BIG_ENDIAN 0
diff --git a/REORG.TODO/sysdeps/i386/htonl.S b/REORG.TODO/sysdeps/i386/htonl.S
new file mode 100644
index 0000000000..63279bb6e1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/htonl.S
@@ -0,0 +1,34 @@
+/* Change byte order in word.  For Intel 80x86, x >= 4.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   word		(sp + 4)
+*/
+
+	.text
+ENTRY (htonl)
+	movl	4(%esp), %eax
+	bswap	%eax
+	ret
+END (htonl)
+
+weak_alias (htonl, ntohl)
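
On little-endian i386 the host-to-network conversion of a 32-bit word
is a single bswap, which is also why the comment above requires an
80x86 with x >= 4: bswap first appeared on the i486.  Usage sketch:

    #include <arpa/inet.h>
    #include <stdio.h>

    int
    main (void)
    {
      printf ("%#x\n", htonl (0x12345678));  /* 0x78563412 on i386 */
      return 0;
    }
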
diff --git a/REORG.TODO/sysdeps/i386/htons.S b/REORG.TODO/sysdeps/i386/htons.S
new file mode 100644
index 0000000000..a3c53a9944
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/htons.S
@@ -0,0 +1,35 @@
+/* Change byte order in word.  For Intel 80x86, x >= 3.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   word		(sp + 4)
+*/
+
+	.text
+ENTRY (htons)
+	movl	4(%esp), %eax
+	andl	$0xffff, %eax
+	rorw	$8, %ax
+	ret
+END (htons)
+
+weak_alias (htons, ntohs)
diff --git a/REORG.TODO/sysdeps/i386/i386-mcount.S b/REORG.TODO/sysdeps/i386/i386-mcount.S
new file mode 100644
index 0000000000..733b8c78e7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i386-mcount.S
@@ -0,0 +1,79 @@
+/* i386-specific implementation of profiling support.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* We need a special version of the `mcount' function since for ix86 it
+   must not clobber any register.  There are several reasons for this:
+     - there is a bug in gcc as of version 2.7.2.2 which prohibits the
+       use of profiling together with nested functions
+     - the ELF `fixup' function uses GCC's regparm feature
+     - some (future) systems might want to pass parameters in registers.  */
+
+	.globl C_SYMBOL_NAME(_mcount)
+	.type C_SYMBOL_NAME(_mcount), @function
+	.align ALIGNARG(4)
+C_LABEL(_mcount)
+	/* Save the caller-clobbered registers.  */
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+
+	movl 12(%esp), %edx
+	movl 4(%ebp), %eax
+
+	/* No need to access the PLT or GOT, __mcount_internal is an
+	   internal function and we can make a relative call.  */
+	call C_SYMBOL_NAME(__mcount_internal)
+
+	/* Pop the saved registers.  Please note that `mcount' has no
+	   return value.  */
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+	ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(_mcount))
+
+#undef mcount
+weak_alias (_mcount, mcount)
+
+	/* Same as above, but doesn't require a frame pointer.  */
+	.globl C_SYMBOL_NAME(__fentry__)
+	.type C_SYMBOL_NAME(__fentry__), @function
+	.align ALIGNARG(4)
+C_LABEL(__fentry__)
+	/* Save the caller-clobbered registers.  */
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+
+	movl 12(%esp), %edx
+	movl 16(%esp), %eax
+
+	/* No need to access the PLT or GOT, __mcount_internal is an
+	   internal function and we can make a relative call.  */
+	call C_SYMBOL_NAME(__mcount_internal)
+
+	/* Pop the saved registers.  Please note that `__fentry__' has no
+	   return value.  */
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+	ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(__fentry__))
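+
+/* Illustrative note, not a literal copy of the glibc declaration: both
+   stubs above call into C code with a contract along the lines of
+
+     static void __mcount_internal (unsigned long frompc,
+				    unsigned long selfpc);
+
+   compiled with a register-parameter calling convention (regparm), so
+   frompc travels in %eax and selfpc in %edx.  That is why _mcount
+   loads 4(%ebp), and __fentry__ loads 16(%esp), into %eax, and both
+   load 12(%esp) into %edx before the call.  */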
diff --git a/REORG.TODO/sysdeps/i386/i586/add_n.S b/REORG.TODO/sysdeps/i386/i586/add_n.S
new file mode 100644
index 0000000000..f73df092f0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/add_n.S
@@ -0,0 +1,143 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16		/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+ENTRY (__mpn_add_n)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 12)
+	movl	S1(%esp),%esi
+	cfi_rel_offset (esi, 8)
+	movl	S2(%esp),%ebx
+	cfi_rel_offset (ebx, 0)
+	movl	SIZE(%esp),%ecx
+	movl	(%ebx),%ebp
+	cfi_rel_offset (ebp, 4)
+
+	decl	%ecx
+	movl	%ecx,%edx
+	shrl	$3,%ecx
+	andl	$7,%edx
+	testl	%ecx,%ecx		/* zero carry flag */
+	jz	L(end)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+
+	ALIGN (3)
+L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
+	leal	32(%edi),%edi
+
+L(1):	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%ebp,%eax
+	movl	4(%ebx),%ebp
+	adcl	%ebp,%edx
+	movl	8(%ebx),%ebp
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L(2):	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	adcl	%ebp,%eax
+	movl	12(%ebx),%ebp
+	adcl	%ebp,%edx
+	movl	16(%ebx),%ebp
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L(3):	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	adcl	%ebp,%eax
+	movl	20(%ebx),%ebp
+	adcl	%ebp,%edx
+	movl	24(%ebx),%ebp
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L(4):	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%ebp,%eax
+	movl	28(%ebx),%ebp
+	adcl	%ebp,%edx
+	movl	32(%ebx),%ebp
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi
+	leal	32(%ebx),%ebx
+	decl	%ecx
+	jnz	L(oop)
+
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+L(end):
+	decl	%edx			/* test %edx w/o clobbering carry */
+	js	L(end2)
+	incl	%edx
+L(oop2):
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	adcl	%ebp,%eax
+	movl	4(%ebx),%ebp
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebx),%ebx
+	decl	%edx
+	jnz	L(oop2)
+L(end2):
+	movl	(%esi),%eax
+	adcl	%ebp,%eax
+	movl	%eax,(%edi)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_add_n)
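+
+/* Illustrative sketch, not part of this file: the operation performed
+   above in plain C, assuming mp_limb_t is a 32-bit unsigned type and
+   mp_size_t a signed integer type, as in this configuration.  The
+   return value is the carry out of the most significant limb.
+
+     mp_limb_t
+     mpn_add_n_sketch (mp_limb_t *res, const mp_limb_t *s1,
+		       const mp_limb_t *s2, mp_size_t n)
+     {
+       mp_limb_t cy = 0;
+       for (mp_size_t i = 0; i < n; i++)
+	 {
+	   mp_limb_t a = s1[i], b = s2[i];
+	   mp_limb_t sum = a + b + cy;
+	   cy = (sum < a) || (cy && sum == a);  // carry out of this limb
+	   res[i] = sum;
+	 }
+       return cy;
+     }
+
+   The unrolling by eight and the `fetch destination cache line' loads
+   are what the assembly adds on top of this.  */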
diff --git a/REORG.TODO/sysdeps/i386/i586/addmul_1.S b/REORG.TODO/sysdeps/i386/i586/addmul_1.S
new file mode 100644
index 0000000000..a713192982
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/addmul_1.S
@@ -0,0 +1,94 @@
+/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+   the result to a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_addmul_1)
+
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %size
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%size,4), %res_ptr
+	leal	(%s1_ptr,%size,4), %s1_ptr
+	negl	%size
+	xorl	%ebp, %ebp
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+
+L(oop):	adcl	$0, %ebp
+	movl	(%s1_ptr,%size,4), %eax
+
+	mull	%s2_limb
+
+	addl	%ebp, %eax
+	movl	(%res_ptr,%size,4), %ebp
+
+	adcl	$0, %edx
+	addl	%eax, %ebp
+
+	movl	%ebp, (%res_ptr,%size,4)
+	incl	%size
+
+	movl	%edx, %ebp
+	jnz	L(oop)
+
+	adcl	$0, %ebp
+	movl	%ebp, %eax
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+#undef size
+END (__mpn_addmul_1)
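+
+/* Illustrative sketch, not part of this file: what the loop above
+   computes, in C, assuming the gmp.h limb types with mp_limb_t 32 bits
+   wide and a 64-bit type available for the product.  The return value
+   is the final carry.
+
+     mp_limb_t
+     mpn_addmul_1_sketch (mp_limb_t *res, const mp_limb_t *s1,
+			  mp_size_t n, mp_limb_t s2_limb)
+     {
+       mp_limb_t cy = 0;
+       for (mp_size_t i = 0; i < n; i++)
+	 {
+	   unsigned long long p = (unsigned long long) s1[i] * s2_limb + cy;
+	   mp_limb_t lo = (mp_limb_t) p;
+	   mp_limb_t sum = res[i] + lo;
+	   cy = (mp_limb_t) (p >> 32) + (sum < lo);
+	   res[i] = sum;
+	 }
+       return cy;
+     }
+*/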
diff --git a/REORG.TODO/sysdeps/i386/i586/bzero.S b/REORG.TODO/sysdeps/i386/i586/bzero.S
new file mode 100644
index 0000000000..2a106719a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/bzero.S
@@ -0,0 +1,4 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include <sysdeps/i386/i586/memset.S>
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/i586/init-arch.h b/REORG.TODO/sysdeps/i386/i586/init-arch.h
new file mode 100644
index 0000000000..4711212e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MINIMUM_ISA 586
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i586/lshift.S b/REORG.TODO/sysdeps/i386/i586/lshift.S
new file mode 100644
index 0000000000..7941c28d9d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/lshift.S
@@ -0,0 +1,255 @@
+/* Pentium optimized __mpn_lshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16		/* space for 4 saved regs */
+#define RES	PARMS
+#define S	RES+4
+#define SIZE	S+4
+#define CNT	SIZE+4
+
+	.text
+ENTRY (__mpn_lshift)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebp, 0)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 12)
+	movl	S(%esp),%esi
+	cfi_rel_offset (esi, 8)
+	movl	SIZE(%esp),%ebx
+	cfi_rel_offset (ebx, 0)
+	movl	CNT(%esp),%ecx
+
+/* We can use faster code for shift-by-1 under certain conditions.  */
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%esi),%eax
+	cmpl	%edi,%eax
+	jnc	L(special)		/* jump if s_ptr + 1 >= res_ptr */
+	leal	(%esi,%ebx,4),%eax
+	cmpl	%eax,%edi
+	jnc	L(special)		/* jump if res_ptr >= s_ptr + size */
+
+L(normal):
+	leal	-4(%edi,%ebx,4),%edi
+	leal	-4(%esi,%ebx,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+	xorl	%eax,%eax
+	shldl	%cl,%edx,%eax		/* compute carry limb */
+	pushl	%eax			/* push carry limb onto stack */
+	cfi_adjust_cfa_offset (4)
+
+	decl	%ebx
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	shrl	$3,%ebx
+	jz	L(end)
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+L(oop):	movl	-28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebp
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	shldl	%cl,%eax,%ebp
+	shldl	%cl,%edx,%eax
+	movl	%ebp,(%edi)
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebp
+	movl	-12(%esi),%eax
+	shldl	%cl,%ebp,%edx
+	shldl	%cl,%eax,%ebp
+	movl	%edx,-8(%edi)
+	movl	%ebp,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebp
+	shldl	%cl,%edx,%eax
+	shldl	%cl,%ebp,%edx
+	movl	%eax,-16(%edi)
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	shldl	%cl,%eax,%ebp
+	shldl	%cl,%edx,%eax
+	movl	%ebp,-24(%edi)
+	movl	%eax,-28(%edi)
+
+	subl	$32,%esi
+	subl	$32,%edi
+	decl	%ebx
+	jnz	L(oop)
+
+L(end):	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	andl	$7,%ebx
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shldl	%cl,%eax,%edx
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	subl	$4,%esi
+	subl	$4,%edi
+	decl	%ebx
+	jnz	L(oop2)
+
+L(end2):
+	shll	%cl,%edx		/* compute least significant limb */
+	movl	%edx,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+	cfi_adjust_cfa_offset (-4)
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+/* The shift-by-1 code below loops from the least significant end of the
+   arrays.  That order is only permissible when it cannot clobber source
+   limbs that have not been read yet; since the function is documented
+   to work for overlapping source and destination, the checks above
+   branch here only when that is the case.
+*/
+
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (edi, 12)
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebp, 4)
+	cfi_rel_offset (ebx, 0)
+L(special):
+	movl	(%esi),%edx
+	addl	$4,%esi
+
+	decl	%ebx
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	shrl	$3,%ebx
+
+	addl	%edx,%edx
+	incl	%ebx
+	decl	%ebx
+	jz	L(Lend)
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+L(Loop):
+	movl	28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebp
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebp,(%edi)
+	adcl	%edx,%edx
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebp
+	movl	12(%esi),%eax
+	adcl	%ebp,%ebp
+	movl	%edx,8(%edi)
+	adcl	%eax,%eax
+	movl	%ebp,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebp
+	adcl	%edx,%edx
+	movl	%eax,16(%edi)
+	adcl	%ebp,%ebp
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebp,24(%edi)
+	adcl	%edx,%edx
+	movl	%eax,28(%edi)
+
+	leal	32(%esi),%esi		/* use leal not to clobber carry */
+	leal	32(%edi),%edi
+	decl	%ebx
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	sbbl	%eax,%eax		/* save carry in %eax */
+	andl	$7,%ebx
+	jz	L(Lend2)
+	addl	%eax,%eax		/* restore carry from eax */
+L(Loop2):
+	movl	%edx,%ebp
+	movl	(%esi),%edx
+	adcl	%edx,%edx
+	movl	%ebp,(%edi)
+
+	leal	4(%esi),%esi		/* use leal not to clobber carry */
+	leal	4(%edi),%edi
+	decl	%ebx
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		/* restore carry from eax */
+L(L1):	movl	%edx,(%edi)		/* store last limb */
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_lshift)
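+
+/* Illustrative sketch, not part of this file: the general case above in
+   C, assuming mp_limb_t is 32 bits and 1 <= cnt < 32.  Like the
+   assembly it walks from the most significant limb downwards and
+   returns the bits shifted out at the top.
+
+     mp_limb_t
+     mpn_lshift_sketch (mp_limb_t *res, const mp_limb_t *src,
+			mp_size_t n, unsigned int cnt)
+     {
+       mp_limb_t retval = src[n - 1] >> (32 - cnt);
+       for (mp_size_t i = n - 1; i > 0; i--)
+	 res[i] = (src[i] << cnt) | (src[i - 1] >> (32 - cnt));
+       res[0] = src[0] << cnt;
+       return retval;
+     }
+
+   The shift-by-1 special case replaces the shldl chain with addl/adcl,
+   which forces it to run from the least significant end, since the
+   carry propagates upwards.  */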
diff --git a/REORG.TODO/sysdeps/i386/i586/memcopy.h b/REORG.TODO/sysdeps/i386/i586/memcopy.h
new file mode 100644
index 0000000000..39f020a746
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memcopy.h
@@ -0,0 +1,95 @@
+/* memcopy.h -- definitions for memory copy functions.  Pentium version.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Get the i386 definitions.  We will override some of them below.  */
+#include <sysdeps/i386/memcopy.h>
+
+/* Written like this, the Pentium pipeline can execute the loop at a
+   sustained rate of 2 instructions/clock, or asymptotically 480
+   Mbytes/second at 60 MHz.  */
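+
+/* (Arithmetic behind that figure, as a sketch: a paired load and store
+   each move 4 bytes per clock, i.e. 8 bytes of combined read plus
+   write traffic per cycle; 8 bytes times 60 MHz is 480
+   Mbytes/second.)  */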
+
+#undef	WORD_COPY_FWD
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)		\
+  do									\
+    {									\
+      asm volatile ("subl	$32,%2\n"				\
+		    "js		2f\n"					\
+		    "movl	0(%0),%%edx\n"	/* alloc dest line */	\
+		    "1:\n"						\
+		    "movl	28(%0),%%eax\n"	/* alloc dest line */	\
+		    "subl	$32,%2\n"	/* decr loop count */	\
+		    "movl	0(%1),%%eax\n"	/* U pipe */		\
+		    "movl	4(%1),%%edx\n"	/* V pipe */		\
+		    "movl	%%eax,0(%0)\n"	/* U pipe */		\
+		    "movl	%%edx,4(%0)\n"	/* V pipe */		\
+		    "movl	8(%1),%%eax\n"				\
+		    "movl	12(%1),%%edx\n"				\
+		    "movl	%%eax,8(%0)\n"				\
+		    "movl	%%edx,12(%0)\n"				\
+		    "movl	16(%1),%%eax\n"				\
+		    "movl	20(%1),%%edx\n"				\
+		    "movl	%%eax,16(%0)\n"				\
+		    "movl	%%edx,20(%0)\n"				\
+		    "movl	24(%1),%%eax\n"				\
+		    "movl	28(%1),%%edx\n"				\
+		    "movl	%%eax,24(%0)\n"				\
+		    "movl	%%edx,28(%0)\n"				\
+		    "leal	32(%1),%1\n"	/* update src ptr */	\
+		    "leal	32(%0),%0\n"	/* update dst ptr */	\
+		    "jns	1b\n"					\
+		    "2: addl	$32,%2" :				\
+		    "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) :	\
+		    "0" (dst_bp), "1" (src_bp), "2" (nbytes) :		\
+		    "ax", "dx");					\
+    } while (0)
+
+#undef	WORD_COPY_BWD
+#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes)		\
+  do									\
+    {									\
+      asm volatile ("subl	$32,%2\n"				\
+		    "js		2f\n"					\
+		    "movl	-4(%0),%%edx\n"				\
+		    "1:\n"						\
+		    "movl	-32(%0),%%eax\n"			\
+		    "subl	$32,%2\n"				\
+		    "movl	-4(%1),%%eax\n"				\
+		    "movl	-8(%1),%%edx\n"				\
+		    "movl	%%eax,-4(%0)\n"				\
+		    "movl	%%edx,-8(%0)\n"				\
+		    "movl	-12(%1),%%eax\n"			\
+		    "movl	-16(%1),%%edx\n"			\
+		    "movl	%%eax,-12(%0)\n"			\
+		    "movl	%%edx,-16(%0)\n"			\
+		    "movl	-20(%1),%%eax\n"			\
+		    "movl	-24(%1),%%edx\n"			\
+		    "movl	%%eax,-20(%0)\n"			\
+		    "movl	%%edx,-24(%0)\n"			\
+		    "movl	-28(%1),%%eax\n"			\
+		    "movl	-32(%1),%%edx\n"			\
+		    "movl	%%eax,-28(%0)\n"			\
+		    "movl	%%edx,-32(%0)\n"			\
+		    "leal	-32(%1),%1\n"				\
+		    "leal	-32(%0),%0\n"				\
+		    "jns	1b\n"					\
+		    "2: addl	$32,%2" :				\
+		    "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) :	\
+		    "0" (dst_ep), "1" (src_ep), "2" (nbytes) :		\
+		    "ax", "dx");					\
+    } while (0)
diff --git a/REORG.TODO/sysdeps/i386/i586/memcpy.S b/REORG.TODO/sysdeps/i386/i586/memcpy.S
new file mode 100644
index 0000000000..6474a3f653
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memcpy.S
@@ -0,0 +1,124 @@
+/* Highly optimized version for i586.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+8	/* space for 2 saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+        .text
+#if defined PIC && IS_IN (libc)
+ENTRY (__memcpy_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk)
+#endif
+ENTRY (memcpy)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+
+	movl	DEST(%esp), %edi
+	cfi_rel_offset (edi, 4)
+	movl	SRC(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl	LEN(%esp), %ecx
+	movl	%edi, %eax
+
+	/* We need this in any case.  */
+	cld
+
+	/* Cutoff for the big loop is a size of 32 bytes since otherwise
+	   the loop will never be entered.  */
+	cmpl	$32, %ecx
+	jbe	L(1)
+
+	negl	%eax
+	andl	$3, %eax
+	subl	%eax, %ecx
+	xchgl	%eax, %ecx
+
+	rep; movsb
+
+	movl	%eax, %ecx
+	subl	$32, %ecx
+	js	L(2)
+
+	/* Read ahead to make sure we write in the cache, since the i586
+	   does not allocate a cache line on a write miss.  */
+	movl	(%edi), %eax
+L(3):	movl	28(%edi), %edx
+
+	/* Now correct the loop counter.  Please note that in the following
+	   code the flags are not changed anymore.  */
+	subl	$32, %ecx
+
+	movl	(%esi), %eax
+	movl	4(%esi), %edx
+	movl	%eax, (%edi)
+	movl	%edx, 4(%edi)
+	movl	8(%esi), %eax
+	movl	12(%esi), %edx
+	movl	%eax, 8(%edi)
+	movl	%edx, 12(%edi)
+	movl	16(%esi), %eax
+	movl	20(%esi), %edx
+	movl	%eax, 16(%edi)
+	movl	%edx, 20(%edi)
+	movl	24(%esi), %eax
+	movl	28(%esi), %edx
+	movl	%eax, 24(%edi)
+	movl	%edx, 28(%edi)
+
+	leal	32(%esi), %esi
+	leal	32(%edi), %edi
+
+	jns	L(3)
+
+	/* Correct extra loop counter modification.  */
+L(2):	addl	$32, %ecx
+#ifndef USE_AS_MEMPCPY
+	movl	DEST(%esp), %eax
+#endif
+
+L(1):	rep; movsb
+
+#ifdef USE_AS_MEMPCPY
+	movl	%edi, %eax
+#endif
+
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (memcpy)
+#ifndef USE_AS_MEMPCPY
+libc_hidden_builtin_def (memcpy)
+#endif
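+
+/* Illustrative sketch, not part of this file: the shape of the routine
+   above in C (the function name is a stand-in, uintptr_t from
+   <stdint.h>).  The assembly additionally reads 28(%edi) ahead of each
+   block so the destination line is in the cache before the eight
+   stores hit it.
+
+     void *
+     memcpy_sketch (void *dstp, const void *srcp, size_t len)
+     {
+       unsigned char *d = dstp;
+       const unsigned char *s = srcp;
+       if (len > 32)
+	 {
+	   size_t align = (- (uintptr_t) d) & 3;  // bytes until d aligned
+	   for (len -= align; align > 0; align--)
+	     *d++ = *s++;
+	   for (; len >= 32; len -= 32)           // unrolled 32-byte blocks
+	     {
+	       for (int i = 0; i < 32; i++)       // eight movl pairs in asm
+		 d[i] = s[i];
+	       d += 32;
+	       s += 32;
+	     }
+	 }
+       while (len-- > 0)                          // rep movsb in asm
+	 *d++ = *s++;
+       return dstp;
+     }
+*/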
diff --git a/REORG.TODO/sysdeps/i386/i586/mempcpy.S b/REORG.TODO/sysdeps/i386/i586/mempcpy.S
new file mode 100644
index 0000000000..720a4c0923
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/mempcpy.S
@@ -0,0 +1,8 @@
+#define USE_AS_MEMPCPY
+#define memcpy __mempcpy
+#define __memcpy_chk __mempcpy_chk
+#include <sysdeps/i386/i586/memcpy.S>
+
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/i586/memset.S b/REORG.TODO/sysdeps/i386/i586/memset.S
new file mode 100644
index 0000000000..4f8f1bcf94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memset.S
@@ -0,0 +1,121 @@
+/* memset/bzero -- set memory area to CH/0
+   Highly optimized version for ix86, x>=5.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Torbjorn Granlund, <tege@matematik.su.se>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define DEST	RTN
+#ifdef USE_AS_BZERO
+# define LEN	DEST+4
+#else
+# define CHR	DEST+4
+# define LEN	CHR+4
+#endif
+
+        .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk)
+#endif
+ENTRY (memset)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+
+	movl	DEST(%esp), %edi
+	cfi_rel_offset (edi, 0)
+	movl	LEN(%esp), %edx
+#ifdef USE_AS_BZERO
+	xorl	%eax, %eax	/* we fill with 0 */
+#else
+	movb	CHR(%esp), %al
+	movb	%al, %ah
+	movl	%eax, %ecx
+	shll	$16, %eax
+	movw	%cx, %ax
+#endif
+	cld
+
+/* If less than 36 bytes to write, skip tricky code (it wouldn't work).  */
+	cmpl	$36, %edx
+	movl	%edx, %ecx	/* needed when branch is taken! */
+	jl	L(2)
+
+/* First write 0-3 bytes to make the pointer 32-bit aligned.  */
+	movl	%edi, %ecx	/* Copy ptr to ecx... */
+	negl	%ecx		/* ...and negate that and... */
+	andl	$3, %ecx	/* ...mask to get byte count.  */
+	subl	%ecx, %edx	/* adjust global byte count */
+	rep
+	stosb
+
+	subl	$32, %edx	/* offset count for unrolled loop */
+	movl	(%edi), %ecx	/* Fetch destination cache line */
+
+	.align	2, 0x90		/* supply 0x90 for broken assemblers */
+L(1):	movl	28(%edi), %ecx	/* allocate cache line for destination */
+	subl	$32, %edx	/* decr loop count */
+	movl	%eax, 0(%edi)	/* store words pairwise */
+	movl	%eax, 4(%edi)
+	movl	%eax, 8(%edi)
+	movl	%eax, 12(%edi)
+	movl	%eax, 16(%edi)
+	movl	%eax, 20(%edi)
+	movl	%eax, 24(%edi)
+	movl	%eax, 28(%edi)
+	leal	32(%edi), %edi	/* update destination pointer */
+	jge	L(1)
+
+	leal	32(%edx), %ecx	/* reset offset count */
+
+/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped).  */
+L(2):	shrl	$2, %ecx	/* convert byte count to longword count */
+	rep
+	stosl
+
+/* Finally write the last 0-3 bytes.  */
+	movl	%edx, %ecx
+	andl	$3, %ecx
+	rep
+	stosb
+
+#ifndef USE_AS_BZERO
+	/* Load result (only if used as memset).  */
+	movl DEST(%esp), %eax	/* start address of destination is result */
+#endif
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (memset)
+libc_hidden_builtin_def (memset)
+
+#if defined SHARED && IS_IN (libc) && !defined __memset_chk \
+    && !defined USE_AS_BZERO
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#endif
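+
+/* Illustrative sketch, not part of this file: the movb/movl/shll/movw
+   sequence above replicates the fill byte into all four bytes of %eax;
+   in C the same word can be built with a single multiply:
+
+     uint32_t word = (unsigned char) c * 0x01010101u;  // c|c|c|c
+
+   The routine then byte-fills up to a 4-byte boundary, stores the word
+   with the unrolled loop (plus rep stosl for the stragglers) and
+   byte-fills the final 0-3 bytes.  */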
diff --git a/REORG.TODO/sysdeps/i386/i586/memusage.h b/REORG.TODO/sysdeps/i386/i586/memusage.h
new file mode 100644
index 0000000000..c8170874d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memusage.h
@@ -0,0 +1 @@
+#include "../i686/memusage.h"
diff --git a/REORG.TODO/sysdeps/i386/i586/mul_1.S b/REORG.TODO/sysdeps/i386/i586/mul_1.S
new file mode 100644
index 0000000000..bd3a07de90
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/mul_1.S
@@ -0,0 +1,90 @@
+/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
+   the result in a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_mul_1)
+
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %size
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%size,4), %res_ptr
+	leal	(%s1_ptr,%size,4), %s1_ptr
+	negl	%size
+	xorl	%ebp, %ebp
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+
+L(oop):	adcl	$0, %ebp
+	movl	(%s1_ptr,%size,4), %eax
+
+	mull	%s2_limb
+
+	addl	%eax, %ebp
+
+	movl	%ebp, (%res_ptr,%size,4)
+	incl	%size
+
+	movl	%edx, %ebp
+	jnz	L(oop)
+
+	adcl	$0, %ebp
+	movl	%ebp, %eax
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+#undef size
+END (__mpn_mul_1)
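+
+/* Illustrative sketch, not part of this file: the loop above in C,
+   assuming mp_limb_t is 32 bits and a 64-bit type holds the product.
+
+     mp_limb_t
+     mpn_mul_1_sketch (mp_limb_t *res, const mp_limb_t *s1,
+		       mp_size_t n, mp_limb_t s2_limb)
+     {
+       mp_limb_t cy = 0;
+       for (mp_size_t i = 0; i < n; i++)
+	 {
+	   unsigned long long p = (unsigned long long) s1[i] * s2_limb + cy;
+	   res[i] = (mp_limb_t) p;
+	   cy = (mp_limb_t) (p >> 32);
+	 }
+       return cy;
+     }
+*/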
diff --git a/REORG.TODO/sysdeps/i386/i586/rshift.S b/REORG.TODO/sysdeps/i386/i586/rshift.S
new file mode 100644
index 0000000000..24c76ee0bb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/rshift.S
@@ -0,0 +1,255 @@
+/* Pentium optimized __mpn_rshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16		/* space for 4 saved regs */
+#define RES	PARMS
+#define S	RES+4
+#define SIZE	S+4
+#define CNT	SIZE+4
+
+	.text
+ENTRY (__mpn_rshift)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebp, 0)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 12)
+	movl	S(%esp),%esi
+	cfi_rel_offset (esi, 8)
+	movl	SIZE(%esp),%ebx
+	cfi_rel_offset (ebx, 0)
+	movl	CNT(%esp),%ecx
+
+/* We can use faster code for shift-by-1 under certain conditions.  */
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%edi),%eax
+	cmpl	%esi,%eax
+	jnc	L(special)		/* jump if res_ptr + 1 >= s_ptr */
+	leal	(%edi,%ebx,4),%eax
+	cmpl	%eax,%esi
+	jnc	L(special)		/* jump if s_ptr >= res_ptr + size */
+
+L(normal):
+	movl	(%esi),%edx
+	addl	$4,%esi
+	xorl	%eax,%eax
+	shrdl	%cl,%edx,%eax		/* compute carry limb */
+	pushl	%eax			/* push carry limb onto stack */
+	cfi_adjust_cfa_offset (4)
+
+	decl	%ebx
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	shrl	$3,%ebx
+	jz	L(end)
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebp
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	shrdl	%cl,%eax,%ebp
+	shrdl	%cl,%edx,%eax
+	movl	%ebp,(%edi)
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebp
+	movl	12(%esi),%eax
+	shrdl	%cl,%ebp,%edx
+	shrdl	%cl,%eax,%ebp
+	movl	%edx,8(%edi)
+	movl	%ebp,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebp
+	shrdl	%cl,%edx,%eax
+	shrdl	%cl,%ebp,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	shrdl	%cl,%eax,%ebp
+	shrdl	%cl,%edx,%eax
+	movl	%ebp,24(%edi)
+	movl	%eax,28(%edi)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	decl	%ebx
+	jnz	L(oop)
+
+L(end):	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	andl	$7,%ebx
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shrdl	%cl,%eax,%edx		/* compute result limb */
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	addl	$4,%esi
+	addl	$4,%edi
+	decl	%ebx
+	jnz	L(oop2)
+
+L(end2):
+	shrl	%cl,%edx		/* compute most significant limb */
+	movl	%edx,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+	cfi_adjust_cfa_offset (-4)
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+/* The shift-by-1 code below loops from the most significant end of the
+   arrays, the opposite direction of the main loop.  That order is only
+   permissible when it cannot clobber source limbs that have not been
+   read yet; since the function is documented to work for overlapping
+   source and destination, the checks above branch here only when that
+   is the case.
+*/
+
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (edi, 12)
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebp, 4)
+	cfi_rel_offset (ebx, 0)
+L(special):
+	leal	-4(%edi,%ebx,4),%edi
+	leal	-4(%esi,%ebx,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+
+	decl	%ebx
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	shrl	$3,%ebx
+
+	shrl	$1,%edx
+	incl	%ebx
+	decl	%ebx
+	jz	L(Lend)
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+L(Loop):
+	movl	-28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebp
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	rcrl	$1,%eax
+	movl	%ebp,(%edi)
+	rcrl	$1,%edx
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebp
+	movl	-12(%esi),%eax
+	rcrl	$1,%ebp
+	movl	%edx,-8(%edi)
+	rcrl	$1,%eax
+	movl	%ebp,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebp
+	rcrl	$1,%edx
+	movl	%eax,-16(%edi)
+	rcrl	$1,%ebp
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	rcrl	$1,%eax
+	movl	%ebp,-24(%edi)
+	rcrl	$1,%edx
+	movl	%eax,-28(%edi)
+
+	leal	-32(%esi),%esi		/* use leal not to clobber carry */
+	leal	-32(%edi),%edi
+	decl	%ebx
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	sbbl	%eax,%eax		/* save carry in %eax */
+	andl	$7,%ebx
+	jz	L(Lend2)
+	addl	%eax,%eax		/* restore carry from eax */
+L(Loop2):
+	movl	%edx,%ebp
+	movl	(%esi),%edx
+	rcrl	$1,%edx
+	movl	%ebp,(%edi)
+
+	leal	-4(%esi),%esi		/* use leal not to clobber carry */
+	leal	-4(%edi),%edi
+	decl	%ebx
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		/* restore carry from eax */
+L(L1):	movl	%edx,(%edi)		/* store last limb */
+
+	movl	$0,%eax
+	rcrl	$1,%eax
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_rshift)
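+
+/* Illustrative sketch, not part of this file: the general case above in
+   C, assuming mp_limb_t is 32 bits and 1 <= cnt < 32.  It walks
+   upwards from the least significant limb and returns the bits shifted
+   out at the bottom.
+
+     mp_limb_t
+     mpn_rshift_sketch (mp_limb_t *res, const mp_limb_t *src,
+			mp_size_t n, unsigned int cnt)
+     {
+       mp_limb_t retval = src[0] << (32 - cnt);
+       for (mp_size_t i = 0; i < n - 1; i++)
+	 res[i] = (src[i] >> cnt) | (src[i + 1] << (32 - cnt));
+       res[n - 1] = src[n - 1] >> cnt;
+       return retval;
+     }
+
+   The shift-by-1 special case uses rcrl and therefore runs from the
+   most significant end, since the carry moves downwards.  */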
diff --git a/REORG.TODO/sysdeps/i386/i586/stpcpy.S b/REORG.TODO/sysdeps/i386/i586/stpcpy.S
new file mode 100644
index 0000000000..8691efd01c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/stpcpy.S
@@ -0,0 +1,8 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+
+#include <sysdeps/i386/i586/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i586/strchr.S b/REORG.TODO/sysdeps/i386/i586/strchr.S
new file mode 100644
index 0000000000..02f66b8f72
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strchr.S
@@ -0,0 +1,348 @@
+/* Find character CH in a NUL terminated string.
+   Highly optimized version for ix86, x>=5.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weaker in this respect because getting
+   as much parallelism would require executing some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (strchr)
+
+	pushl %edi		/* Save callee-safe registers.  */
+	cfi_adjust_cfa_offset (4)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	pushl %ebx
+	cfi_adjust_cfa_offset (4)
+	pushl %ebp
+	cfi_adjust_cfa_offset (4)
+
+	movl STR(%esp), %eax
+	movl CHR(%esp), %edx
+
+	movl %eax, %edi		/* duplicate string pointer for later */
+	cfi_rel_offset (edi, 12)
+	xorl %ecx, %ecx		/* clear %ecx */
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* now it is 0|0|c|c */
+	movb %dl, %cl		/* we construct the lower half in %ecx */
+
+	shll $16, %edx		/* now %edx is c|c|0|0 */
+	movb %cl, %ch		/* now %ecx is 0|0|c|c */
+
+	orl %ecx, %edx		/* and finally c|c|c|c */
+	andl $3, %edi		/* mask alignment bits */
+
+	jz L(11)		/* alignment is 0 => start loop */
+
+	movb %dl, %cl		/* 0 is needed below */
+	jp L(0)			/* exactly two bits set */
+
+	xorb (%eax), %cl	/* is byte the one we are looking for? */
+	jz L(out)		/* yes => return pointer */
+
+	xorb %dl, %cl		/* load single byte and test for NUL */
+	je L(3)			/* yes => return NULL */
+
+	movb 1(%eax), %cl	/* load single byte */
+	incl %eax
+
+	cmpb %cl, %dl		/* is byte == C? */
+	je L(out)		/* aligned => return pointer */
+
+	cmpb $0, %cl		/* is byte NUL? */
+	je L(3)			/* yes => return NULL */
+
+	incl %eax
+	decl %edi
+
+	jne L(11)
+
+L(0):	movb (%eax), %cl	/* load single byte */
+
+	cmpb %cl, %dl		/* is byte == C? */
+	je L(out)		/* aligned => return pointer */
+
+	cmpb $0, %cl		/* is byte NUL? */
+	je L(3)			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebx, 4)
+	cfi_rel_offset (ebp, 0)
+
+	/* The following code is the preparation for the loop.  The
+	   four instructions up to `L1' will not be executed in the loop
+	   because the same code is found at the end of the loop, but
+	   there it is executed in parallel with other instructions.  */
+L(11):	movl (%eax), %ecx
+	movl $magic, %ebp
+
+	movl $magic, %edi
+	addl %ecx, %ebp
+
+	/* The main loop: it looks complex and indeed it is.  I would
+	   love to say `it was hard to write, so it should be hard to
+	   read' but I will give some more hints.  To fully understand
+	   this code you should first take a look at the i486 version.
+	   The basic algorithm is the same, but here the code is organized
+	   in a way which permits using both pipelines all the time.
+
+	   I tried to make it a bit more understandable by indenting
+	   the code according to stage in the algorithm.  It goes as
+	   follows:
+		check for 0 in 1st word
+			check for C in 1st word
+					check for 0 in 2nd word
+						check for C in 2nd word
+		check for 0 in 3rd word
+			check for C in 3rd word
+					check for 0 in 4th word
+						check for C in 4th word
+
+	   Please note that doing the test for NUL before the test for
+	   C allows us to overlap the test for 0 in the next word with
+	   the test for C.  */
+
+L(1):	xorl %ecx, %ebp			/* (word^magic) */
+	addl %ecx, %edi			/* add magic word */
+
+	leal 4(%eax), %eax		/* increment pointer */
+	jnc L(4)			/* previous addl caused overflow? */
+
+		movl %ecx, %ebx		/* duplicate original word */
+	orl $magic, %ebp		/* (word^magic)|magic */
+
+	addl $1, %ebp			/* (word^magic)|magic == 0xffffffff? */
+	jne L(4)				/* yes => we found word with NUL */
+
+		movl $magic, %esi	/* load magic value */
+		xorl %edx, %ebx		/* clear words which are C */
+
+					movl (%eax), %ecx
+		addl %ebx, %esi		/* (word+magic) */
+
+					movl $magic, %edi
+		jnc L(5)		/* previous addl caused overflow? */
+
+					movl %edi, %ebp
+		xorl %ebx, %esi		/* (word+magic)^word */
+
+					addl %ecx, %ebp
+		orl $magic, %esi	/* ((word+magic)^word)|magic */
+
+		addl $1, %esi		/* ((word+magic)^word)|magic==0xf..f?*/
+		jne L(5)		/* yes => we found word with C */
+
+					xorl %ecx, %ebp
+					addl %ecx, %edi
+
+					leal 4(%eax), %eax
+					jnc L(4)
+
+						movl %ecx, %ebx
+					orl $magic, %ebp
+
+					addl $1, %ebp
+					jne L(4)
+
+						movl $magic, %esi
+						xorl %edx, %ebx
+
+	movl (%eax), %ecx
+						addl %ebx, %esi
+
+	movl $magic, %edi
+						jnc L(5)
+
+	movl %edi, %ebp
+						xorl %ebx, %esi
+
+	addl %ecx, %ebp
+						orl $magic, %esi
+
+						addl $1, %esi
+						jne L(5)
+
+	xorl %ecx, %ebp
+	addl %ecx, %edi
+
+	leal 4(%eax), %eax
+	jnc L(4)
+
+		movl %ecx, %ebx
+	orl $magic, %ebp
+
+	addl $1, %ebp
+	jne L(4)
+
+		movl $magic, %esi
+		xorl %edx, %ebx
+
+					movl (%eax), %ecx
+		addl %ebx, %esi
+
+					movl $magic, %edi
+		jnc L(5)
+
+					movl %edi, %ebp
+		xorl %ebx, %esi
+
+					addl %ecx, %ebp
+		orl $magic, %esi
+
+		addl $1, %esi
+		jne L(5)
+
+					xorl %ecx, %ebp
+					addl %ecx, %edi
+
+					leal 4(%eax), %eax
+					jnc L(4)
+
+						movl %ecx, %ebx
+					orl $magic, %ebp
+
+					addl $1, %ebp
+					jne L(4)
+
+						movl $magic, %esi
+						xorl %edx, %ebx
+
+	movl (%eax), %ecx
+						addl %ebx, %esi
+
+	movl $magic, %edi
+						jnc L(5)
+
+	movl %edi, %ebp
+						xorl %ebx, %esi
+
+	addl %ecx, %ebp
+						orl $magic, %esi
+
+						addl $1, %esi
+
+						je L(1)
+
+	/* We know there is no NUL byte in the word, but there is a C
+	   byte.  %ebx holds the word XORed with C in every byte, so
+	   the matching byte contains NUL there.  */
+L(5):	subl $4, %eax		/* adjust pointer */
+	testb %bl, %bl		/* first byte == C? */
+
+	jz L(out)		/* yes => return pointer */
+
+	incl %eax		/* increment pointer */
+	testb %bh, %bh		/* second byte == C? */
+
+	jz L(out)		/* yes => return pointer */
+
+	shrl $16, %ebx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmp $0, %bl		/* third byte == C */
+	je L(out)		/* yes => return pointer */
+
+	incl %eax		/* increment pointer */
+
+L(out):	popl %ebp		/* restore saved registers */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl %ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (edi, 12)
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebx, 4)
+	cfi_rel_offset (ebp, 0)
+	/* We know there is a NUL byte in the word.  But we have to test
+	   whether there is a C byte before it in the word.  */
+L(4):	subl $4, %eax		/* adjust pointer */
+	cmpb %dl, %cl		/* first byte == C? */
+
+	je L(out)		/* yes => return pointer */
+
+	cmpb $0, %cl		/* first byte == NUL? */
+	je L(3)			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	cmpb %dl, %ch		/* second byte == C? */
+	je L(out)		/* yes => return pointer */
+
+	cmpb $0, %ch		/* second byte == NUL? */
+	je L(3)			/* yes => return NULL */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb %dl, %cl		/* third byte == C? */
+	je L(out)		/* yes => return pointer */
+
+	cmpb $0, %cl		/* third byte == NUL? */
+	je L(3)			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	/* The test for the fourth byte is necessary!  */
+	cmpb %dl, %ch		/* fourth byte == C? */
+	je L(out)		/* yes => return pointer */
+
+L(3):	xorl %eax, %eax
+	jmp L(out)
+END (strchr)
+
+#undef index
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
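+
+/* Illustrative sketch, not part of this file: the magic-number test the
+   loop performs, in C.  A word w contains a NUL byte exactly when
+   adding 0xfefefeff either fails to carry out of bit 31 or fails to
+   propagate through one of the `hole' bits 8, 16 and 24 (~magic is
+   0x01010100); a byte equal to C is found by first XORing w with C
+   replicated into every byte.
+
+     static int
+     has_nul (uint32_t w)
+     {
+       uint64_t sum = (uint64_t) w + 0xfefefeffu;  // keep the carry bit
+       if ((sum >> 32) == 0)                       // no carry out at all
+	 return 1;
+       return (((uint32_t) sum ^ ~w) & 0x01010100u) != 0;
+     }
+
+     static int
+     has_char (uint32_t w, unsigned char c)
+     {
+       return has_nul (w ^ (c * 0x01010101u));
+     }
+*/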
diff --git a/REORG.TODO/sysdeps/i386/i586/strcpy.S b/REORG.TODO/sysdeps/i386/i586/strcpy.S
new file mode 100644
index 0000000000..a444604f4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strcpy.S
@@ -0,0 +1,169 @@
+/* strcpy/stpcpy implementation for i586.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+12	/* space for 3 saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+
+#ifndef USE_AS_STPCPY
+# define STRCPY strcpy
+#endif
+
+#define magic 0xfefefeff
+
+	.text
+ENTRY (STRCPY)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	DEST(%esp), %edi
+	cfi_rel_offset (edi, 8)
+	movl	SRC(%esp), %esi
+	cfi_rel_offset (esi, 4)
+
+	xorl	%eax, %eax
+	leal	-1(%esi), %ecx
+
+	movl	$magic, %ebx
+	cfi_rel_offset (ebx, 0)
+	andl	$3, %ecx
+
+#ifdef PIC
+	call	2f
+	cfi_adjust_cfa_offset (4)
+2:	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	/* 0xb is the distance between 2: and 1: but we avoid writing
+	   1f-2b because the assembler generates worse code.  */
+	leal	0xb(%edx,%ecx,8), %ecx
+#else
+	leal	1f(,%ecx,8), %ecx
+#endif
+
+	jmp	*%ecx
+
+	.align 8
+1:
+	orb	(%esi), %al
+	jz	L(end)
+	stosb
+	xorl	%eax, %eax
+	incl	%esi
+
+	orb	(%esi), %al
+	jz	L(end)
+	stosb
+	xorl	%eax, %eax
+	incl	%esi
+
+	orb	(%esi), %al
+	jz	L(end)
+	stosb
+	xorl	%eax, %eax
+	incl	%esi
+
+L(1):	movl	(%esi), %ecx
+	leal	4(%esi),%esi
+
+	subl	%ecx, %eax
+	addl	%ebx, %ecx
+
+	decl	%eax
+	jnc	L(3)
+
+	movl	%ecx, %edx
+	xorl	%ecx, %eax
+
+	subl	%ebx, %edx
+	andl	$~magic, %eax
+
+	jne	L(4)
+
+	movl	%edx, (%edi)
+	leal	4(%edi),%edi
+
+	jmp	L(1)
+
+L(3):	movl	%ecx, %edx
+
+	subl	%ebx, %edx
+
+L(4):	movb	%dl, (%edi)
+	testb	%dl, %dl
+
+	movl	%edx, %eax
+	jz	L(end2)
+
+	shrl	$16, %eax
+	movb	%dh, 1(%edi)
+#ifdef USE_AS_STPCPY
+	addl	$1, %edi
+#endif
+
+	cmpb	$0, %dh
+	jz	L(end2)
+
+#ifdef USE_AS_STPCPY
+	movb	%al, 1(%edi)
+	addl	$1, %edi
+
+	cmpb	$0, %al
+	jz	L(end2)
+
+	addl	$1, %edi
+#else
+	movb	%al, 2(%edi)
+	testb	%al, %al
+
+	leal	3(%edi), %edi
+	jz	L(end2)
+#endif
+
+L(end):	movb	%ah, (%edi)
+
+L(end2):
+#ifdef USE_AS_STPCPY
+	movl	%edi, %eax
+#else
+	movl	DEST(%esp), %eax
+#endif
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (STRCPY)
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
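+
+/* Illustrative sketch, not part of this file: the computed jump above
+   dispatches on (src - 1) & 3 into a row of byte copies so that the
+   word loop always runs on an aligned source.  In C the dispatch
+   corresponds to a fall-through switch (the function name is a
+   stand-in, and the word loop is abbreviated to a byte loop):
+
+     char *
+     strcpy_sketch (char *dest, const char *src)
+     {
+       char *d = dest;
+       switch (((uintptr_t) src - 1) & 3)
+	 {
+	 case 0:                     // src == 1 (mod 4): copy 3 bytes
+	   if ((*d++ = *src++) == '\0') return dest;
+	   // fall through
+	 case 1:                     // src == 2 (mod 4): copy 2 bytes
+	   if ((*d++ = *src++) == '\0') return dest;
+	   // fall through
+	 case 2:                     // src == 3 (mod 4): copy 1 byte
+	   if ((*d++ = *src++) == '\0') return dest;
+	 }
+       // src is now 4-byte aligned; the assembly continues with the
+       // 0xfefefeff word loop.
+       while ((*d++ = *src++) != '\0')
+	 ;
+       return dest;
+     }
+*/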
diff --git a/REORG.TODO/sysdeps/i386/i586/strlen.S b/REORG.TODO/sysdeps/i386/i586/strlen.S
new file mode 100644
index 0000000000..cfea2e020f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strlen.S
@@ -0,0 +1,182 @@
+/* strlen -- Compute length of NUL terminated string.
+   Highly optimized version for ix86, x>=5.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weaker in this respect because getting
+   as much parallelism would require executing some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS
+
+	.text
+ENTRY (strlen)
+
+	movl STR(%esp), %eax
+	movl $3, %edx		/* load mask (= 3) */
+
+	andl %eax, %edx		/* separate last two bits of address */
+
+	jz L(1)			/* aligned => start loop */
+	jp L(0)			/* exactly two bits set */
+
+	cmpb %dh, (%eax)	/* is byte NUL? */
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	cmpb %dh, (%eax)	/* is byte NUL? */
+
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	xorl $2, %edx
+
+	jz L(1)
+
+L(0):	cmpb %dh, (%eax)	/* is byte NUL? */
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	xorl %edx, %edx		/* We need %edx == 0 for later */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 Note: %edx == 0 in any case here.  */
+
+L(1):
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L(3)		/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L(3)		/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L(3)		/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L(3)		/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	je L(1)			/* no => start loop again */
+
+
+L(3):	subl $4, %eax		/* correct too early pointer increment */
+	subl $magic, %ecx
+
+	cmpb $0, %cl		/* lowest byte NUL? */
+	jz L(2)			/* yes => return */
+
+	inc %eax		/* increment pointer */
+	testb %ch, %ch		/* second byte NUL? */
+
+	jz L(2)			/* yes => return */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb $0, %cl		/* is third byte NUL? */
+	jz L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+
+L(2):	subl STR(%esp), %eax	/* now compute the length as difference
+				   between start and terminating NUL
+				   character */
+	ret
+END (strlen)
+libc_hidden_builtin_def (strlen)
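+
+/* Illustrative sketch, not part of this file: the function above in C.
+   The aligned word access mimics what the assembly does and is not
+   strictly portable C (it may read a few bytes past the terminator,
+   though never across a page the string does not touch).
+
+     size_t
+     strlen_sketch (const char *str)
+     {
+       const char *p = str;
+       while ((uintptr_t) p & 3)            // reach word alignment
+	 if (*p++ == '\0')
+	   return p - 1 - str;
+       for (;;)
+	 {
+	   uint32_t w = *(const uint32_t *) p;
+	   uint64_t sum = (uint64_t) w + 0xfefefeffu;
+	   if ((sum >> 32) == 0
+	       || ((((uint32_t) sum) ^ ~w) & 0x01010100u) != 0)
+	     break;                          // some byte of w is NUL
+	   p += 4;
+	 }
+       while (*p != '\0')                   // find the exact byte
+	 p++;
+       return p - str;
+     }
+*/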
diff --git a/REORG.TODO/sysdeps/i386/i586/sub_n.S b/REORG.TODO/sysdeps/i386/i586/sub_n.S
new file mode 100644
index 0000000000..21b5a2742c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/sub_n.S
@@ -0,0 +1,143 @@
+/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+   and store difference in a third limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16		/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+ENTRY (__mpn_sub_n)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 12)
+	movl	S1(%esp),%esi
+	cfi_rel_offset (esi, 8)
+	movl	S2(%esp),%ebx
+	cfi_rel_offset (ebx, 0)
+	movl	SIZE(%esp),%ecx
+	movl	(%ebx),%ebp
+	cfi_rel_offset (ebp, 4)
+
+	decl	%ecx
+	movl	%ecx,%edx
+	shrl	$3,%ecx
+	andl	$7,%edx
+	testl	%ecx,%ecx		/* zero carry flag */
+	jz	L(end)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+
+	ALIGN (3)
+L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
+	leal	32(%edi),%edi
+
+L(1):	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	sbbl	%ebp,%eax
+	movl	4(%ebx),%ebp
+	sbbl	%ebp,%edx
+	movl	8(%ebx),%ebp
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L(2):	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	sbbl	%ebp,%eax
+	movl	12(%ebx),%ebp
+	sbbl	%ebp,%edx
+	movl	16(%ebx),%ebp
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L(3):	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	sbbl	%ebp,%eax
+	movl	20(%ebx),%ebp
+	sbbl	%ebp,%edx
+	movl	24(%ebx),%ebp
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L(4):	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	sbbl	%ebp,%eax
+	movl	28(%ebx),%ebp
+	sbbl	%ebp,%edx
+	movl	32(%ebx),%ebp
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi
+	leal	32(%ebx),%ebx
+	decl	%ecx
+	jnz	L(oop)
+
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+L(end):
+	decl	%edx			/* test %edx w/o clobbering carry */
+	js	L(end2)
+	incl	%edx
+L(oop2):
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	sbbl	%ebp,%eax
+	movl	4(%ebx),%ebp
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebx),%ebx
+	decl	%edx
+	jnz	L(oop2)
+L(end2):
+	movl	(%esi),%eax
+	sbbl	%ebp,%eax
+	movl	%eax,(%edi)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_sub_n)
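
Functionally, the unrolled loop above computes a limb-vector subtraction
with borrow propagation.  A plain C rendering (the names and the uint32_t
limb type are illustrative; the real interface uses GMP's mp_limb_t):

  #include <stddef.h>
  #include <stdint.h>

  /* res[] = s1[] - s2[] over n > 0 limbs; return the final borrow.  */
  static uint32_t mpn_sub_n_sketch (uint32_t *res, const uint32_t *s1,
                                    const uint32_t *s2, size_t n)
  {
    uint32_t borrow = 0;
    for (size_t i = 0; i < n; i++)
      {
        uint32_t a = s1[i], b = s2[i];
        uint32_t d = a - b;
        uint32_t out = d - borrow;
        /* Borrow out if a < b, or if subtracting the incoming borrow
           underflowed d.  */
        borrow = (a < b) | (d < borrow);
        res[i] = out;
      }
    return borrow;
  }
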
diff --git a/REORG.TODO/sysdeps/i386/i586/submul_1.S b/REORG.TODO/sysdeps/i386/i586/submul_1.S
new file mode 100644
index 0000000000..5e5e121ca2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/submul_1.S
@@ -0,0 +1,94 @@
+/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+   the result from a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_submul_1)
+
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %size
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%size,4), %res_ptr
+	leal	(%s1_ptr,%size,4), %s1_ptr
+	negl	%size
+	xorl	%ebp, %ebp
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+
+L(oop):	adcl	$0, %ebp
+	movl	(%s1_ptr,%size,4), %eax
+
+	mull	%s2_limb
+
+	addl	%ebp, %eax
+	movl	(%res_ptr,%size,4), %ebp
+
+	adcl	$0, %edx
+	subl	%eax, %ebp
+
+	movl	%ebp, (%res_ptr,%size,4)
+	incl	%size
+
+	movl	%edx, %ebp
+	jnz	L(oop)
+
+	adcl	$0, %ebp
+	movl	%ebp, %eax
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+#undef size
+END (__mpn_submul_1)
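
In C terms the loop above is a multiply-and-subtract over the vector,
carrying both the high product limb and the borrow in a single value.  A
sketch (a 64-bit intermediate stands in for the mull result in %edx:%eax;
names are illustrative):

  #include <stddef.h>
  #include <stdint.h>

  /* res[] -= s1[] * limb over n > 0 limbs; return the final carry
     limb.  */
  static uint32_t mpn_submul_1_sketch (uint32_t *res, const uint32_t *s1,
                                       size_t n, uint32_t limb)
  {
    uint32_t carry = 0;
    for (size_t i = 0; i < n; i++)
      {
        uint64_t prod = (uint64_t) s1[i] * limb + carry;
        uint32_t lo = (uint32_t) prod;
        uint32_t r = res[i];
        res[i] = r - lo;
        carry = (uint32_t) (prod >> 32) + (r < lo);
      }
    return carry;
  }
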
diff --git a/REORG.TODO/sysdeps/i386/i686/Makefile b/REORG.TODO/sysdeps/i386/i686/Makefile
new file mode 100644
index 0000000000..311042787b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/Makefile
@@ -0,0 +1,12 @@
+# So that we can test __m128's alignment
+stack-align-test-flags += -msse
+
+CFLAGS-.o += -Wa,-mtune=i686
+CFLAGS-.os += -Wa,-mtune=i686
+CFLAGS-.op += -Wa,-mtune=i686
+CFLAGS-.oS += -Wa,-mtune=i686
+
+ASFLAGS-.o += -Wa,-mtune=i686
+ASFLAGS-.os += -Wa,-mtune=i686
+ASFLAGS-.op += -Wa,-mtune=i686
+ASFLAGS-.oS += -Wa,-mtune=i686
diff --git a/REORG.TODO/sysdeps/i386/i686/add_n.S b/REORG.TODO/sysdeps/i386/i686/add_n.S
new file mode 100644
index 0000000000..4afa648ceb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/add_n.S
@@ -0,0 +1,110 @@
+/* Add two limb vectors of the same length > 0 and store sum in a third
+   limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+8		/* space for 2 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+#ifdef PIC
+L(1):	addl    (%esp), %eax
+	ret
+#endif
+ENTRY (__mpn_add_n)
+
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 4)
+	movl	S1(%esp),%esi
+	cfi_rel_offset (esi, 0)
+	movl	S2(%esp),%edx
+	movl	SIZE(%esp),%ecx
+	movl	%ecx,%eax
+	shrl	$3,%ecx			/* compute count for unrolled loop */
+	negl	%eax
+	andl	$7,%eax			/* get index where to start loop */
+	jz	L(oop)			/* necessary special case for 0 */
+	incl	%ecx			/* adjust loop count */
+	shll	$2,%eax			/* adjustment for pointers... */
+	subl	%eax,%edi		/* ... since they are offset ... */
+	subl	%eax,%esi		/* ... by a constant when we ... */
+	subl	%eax,%edx		/* ... enter the loop */
+	shrl	$2,%eax			/* restore previous value */
+#ifdef PIC
+/* Calculate start address in loop for PIC.  */
+	leal	(L(oop)-L(0)-3)(%eax,%eax,8),%eax
+	call	L(1)
+L(0):
+#else
+/* Calculate start address in loop for non-PIC.  */
+ 	leal	(L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+	jmp	*%eax			/* jump into loop */
+	ALIGN (3)
+L(oop):	movl	(%esi),%eax
+	adcl	(%edx),%eax
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	adcl	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	adcl	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	adcl	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	adcl	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	adcl	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	adcl	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	adcl	28(%edx),%eax
+	movl	%eax,28(%edi)
+	leal	32(%edi),%edi
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx
+	jnz	L(oop)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_add_n)
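
The leal/`jmp *%eax` pair above enters the eight-way unrolled loop at the
residue position instead of running a separate remainder loop -- the
assembly analogue of a Duff's device.  The same control structure in C
(an illustrative sketch, not this file's code):

  #include <stddef.h>
  #include <stdint.h>

  /* res[] = s1[] + s2[] over n > 0 limbs; return the final carry.
     The switch falls through into the unrolled do-while, entering at
     the slot matching n % 8, just as the computed jump does above.  */
  static uint32_t mpn_add_n_sketch (uint32_t *res, const uint32_t *s1,
                                    const uint32_t *s2, size_t n)
  {
  #define STEP                                                  \
    do { uint64_t t = (uint64_t) s1[i] + s2[i] + carry;         \
         res[i] = (uint32_t) t; carry = (uint32_t) (t >> 32);   \
         i++; } while (0)
    uint32_t carry = 0;
    size_t i = 0, rounds = (n + 7) / 8;
    switch (n % 8)
      {
      case 0: do { STEP;
      case 7:      STEP;
      case 6:      STEP;
      case 5:      STEP;
      case 4:      STEP;
      case 3:      STEP;
      case 2:      STEP;
      case 1:      STEP;
                 } while (--rounds > 0);
      }
  #undef STEP
    return carry;
  }
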
diff --git a/REORG.TODO/sysdeps/i386/i686/bcopy.S b/REORG.TODO/sysdeps/i386/i686/bcopy.S
new file mode 100644
index 0000000000..15ef9419a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/bcopy.S
@@ -0,0 +1,3 @@
+#define USE_AS_BCOPY
+#define memmove bcopy
+#include <sysdeps/i386/i686/memmove.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/bzero.S b/REORG.TODO/sysdeps/i386/i686/bzero.S
new file mode 100644
index 0000000000..c7898f18e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/bzero.S
@@ -0,0 +1,4 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include <sysdeps/i386/i686/memset.S>
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/i686/dl-hash.h b/REORG.TODO/sysdeps/i386/i686/dl-hash.h
new file mode 100644
index 0000000000..ceda785b32
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/dl-hash.h
@@ -0,0 +1,79 @@
+/* Compute hash value for given string according to ELF standard.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_HASH_H
+#define _DL_HASH_H	1
+
+
+/* This is the hashing function specified by the ELF ABI.  It is highly
+   optimized for PII processors.  Though it will run on an i586, it
+   would be much slower there than the generic C implementation, so
+   don't use it on that CPU.  */
+static unsigned int
+__attribute__ ((unused))
+_dl_elf_hash (const char *name)
+{
+  unsigned int result;
+  unsigned int temp0;
+  unsigned int temp1;
+
+  __asm__ __volatile__
+    ("movzbl (%1),%2\n\t"
+     "testl %2, %2\n\t"
+     "jz 1f\n\t"
+     "movl %2, %0\n\t"
+     "movzbl 1(%1), %2\n\t"
+     "jecxz 1f\n\t"
+     "shll $4, %0\n\t"
+     "addl %2, %0\n\t"
+     "movzbl 2(%1), %2\n\t"
+     "jecxz 1f\n\t"
+     "shll $4, %0\n\t"
+     "addl %2, %0\n\t"
+     "movzbl 3(%1), %2\n\t"
+     "jecxz 1f\n\t"
+     "shll $4, %0\n\t"
+     "addl %2, %0\n\t"
+     "movzbl 4(%1), %2\n\t"
+     "jecxz 1f\n\t"
+     "shll $4, %0\n\t"
+     "addl $5, %1\n\t"
+     "addl %2, %0\n\t"
+     "movzbl (%1), %2\n\t"
+     "jecxz 1f\n"
+     "2:\t"
+     "shll $4, %0\n\t"
+     "movl $0xf0000000, %3\n\t"
+     "incl %1\n\t"
+     "addl %2, %0\n\t"
+     "andl %0, %3\n\t"
+     "andl $0x0fffffff, %0\n\t"
+     "shrl $24, %3\n\t"
+     "movzbl (%1), %2\n\t"
+     "xorl %3, %0\n\t"
+     "testl %2, %2\n\t"
+     "jnz 2b\n"
+     "1:\t"
+     : "=&r" (result), "=r" (name), "=&c" (temp0), "=&r" (temp1)
+     : "0" (0), "1" ((const unsigned char *) name));
+
+  return result;
+}
+
+#endif /* dl-hash.h */
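
The hand-scheduled asm above must agree with the standard System V ELF
hash, whose reference C form is:

  /* ELF symbol hash as given in the System V ABI.  */
  static unsigned int
  elf_hash_ref (const unsigned char *name)
  {
    unsigned int h = 0, g;

    while (*name != '\0')
      {
        h = (h << 4) + *name++;
        g = h & 0xf0000000;
        if (g != 0)
          h ^= g >> 24;
        h &= ~g;
      }
    return h;
  }
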
diff --git a/REORG.TODO/sysdeps/i386/i686/ffs.c b/REORG.TODO/sysdeps/i386/i686/ffs.c
new file mode 100644
index 0000000000..cbe36ff873
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/ffs.c
@@ -0,0 +1,48 @@
+/* ffs -- find first set bit in a word, counted from least significant end.
+   For Intel 80x86 with x >= 6, i.e. i686 and newer.
+   This file is part of the GNU C Library.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef	ffs
+
+#ifdef	__GNUC__
+
+int
+__ffs (int x)
+{
+  int cnt;
+  int tmp;
+
+  asm ("bsfl %2,%0\n"		/* Count low bits in X and store in %1.  */
+       "cmovel %1,%0\n"		/* If number was zero, use -1 as result.  */
+       : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1));
+
+  return cnt + 1;
+}
+weak_alias (__ffs, ffs)
+libc_hidden_def (__ffs)
+libc_hidden_builtin_def (ffs)
+#undef ffsl
+weak_alias (__ffs, ffsl)
+
+#else
+#include <string/ffs.c>
+#endif
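
The bsfl/cmovel pair is a branch-free rendering of the usual idiom; in
portable GCC C the same result can be sketched as:

  /* ffs: 1-based index of the least significant set bit, 0 for x == 0.
     bsfl supplies the bit index; cmovel substitutes -1 when x is 0 so
     that the final "+ 1" yields 0.  */
  static int ffs_sketch (int x)
  {
    return x != 0 ? 1 + __builtin_ctz ((unsigned int) x) : 0;
  }
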
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S
new file mode 100644
index 0000000000..73060b088c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+	.text
+ENTRY(__ieee754_log)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+	fucomi	%st
+	jp	3f
+	fyl2x			// log(x)
+	ret
+
+3:	fstp	%st(1)
+	ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+END(__log_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S
new file mode 100644
index 0000000000..6fd39d50d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+	.text
+ENTRY(__ieee754_logf)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+	fucomi	%st
+	jp	3f
+	fyl2x			// log(x)
+	ret
+
+3:	fstp	%st(1)
+	ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+END(__logf_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S
new file mode 100644
index 0000000000..7e3bc8d817
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S
@@ -0,0 +1,94 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_logl)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+	fucomi	%st
+	jp	3f
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	movzwl	4+8(%esp), %eax
+	cmpl	$0xc000, %eax
+	jae	5f		// x <= -2, avoid overflow from -LDBL_MAX - 1.
+	fsubl	MO(one)		// x-1 : x : log(2)
+5:	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fld	MO(limit)	// 0.29 : |x-1| : x-1 : x : log(2)
+	fcomip	%st(1)		// |x-1| : x-1 : x : log(2)
+	fstp	%st(0)		// x-1 : x : log(2)
+	jc	2f
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	4f
+	fabs			// log(1) is +0 in all rounding modes.
+4:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	fstp	%st(1)
+	fadd	%st(0)
+	ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fld	MO(limit)	// 0.29 : |x-1| : x-1 : x : log(2)
+	fcomip	%st(1)		// |x-1| : x-1 : x : log(2)
+	fstp	%st(0)		// x-1 : x : log(2)
+	jc	2b
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	6f
+	fabs			// log(1) is +0 in all rounding modes.
+6:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logl_finite)
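
The limit constant and the fyl2xp1 branch exist because evaluating log(x)
via fyl2x for x near 1 cancels most significant bits of x-1 before the
instruction ever sees them; fyl2xp1 takes x-1 directly and keeps that
precision.  The same split expressed in C (log1p standing in for
fyl2xp1):

  #include <math.h>

  /* Accurate log near 1: form x - 1 once, in full precision, and let
     log1p handle the small argument.  */
  static double log_split_sketch (double x)
  {
    double d = x - 1.0;
    if (fabs (d) < 0.29)        /* same threshold as `limit' above */
      return log1p (d);         /* analogue of fyl2xp1 */
    return log (x);             /* analogue of fyl2x */
  }
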
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile
new file mode 100644
index 0000000000..7d9089232f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),math)
+libm-sysdep_routines += e_expf-sse2 e_expf-ia32 s_sinf-sse2 s_cosf-sse2 \
+                        s_sincosf-sse2
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S
new file mode 100644
index 0000000000..b486b4d1ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S
@@ -0,0 +1,22 @@
+/*
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define __ieee754_expf __ieee754_expf_ia32
+#define __expf_finite __expf_finite_ia32
+
+#include <sysdeps/i386/fpu/e_expf.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
new file mode 100644
index 0000000000..e6bb6fa289
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
@@ -0,0 +1,325 @@
+/* SSE2 version of __ieee754_expf and __expf_finite
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ *  Let K = 64 (table size).
+ *       e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ *  where
+ *       x = m*log(2)/K + y,    y in [0.0..log(2)/K]
+ *       m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
+ *       values of 2^(j/K) are tabulated as T[j].
+ *
+ *       P(y) is a minimax polynomial approximation of expf(x)-1
+ *       on small interval [0.0..log(2)/K].
+ *
+ *       P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ *       z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ *  __ieee754_expf_sse2(NaN) = NaN
+ *  __ieee754_expf_sse2(+INF) = +INF
+ *  __ieee754_expf_sse2(-INF) = 0
+ *  __ieee754_expf_sse2(x) = 1 for subnormals
+ *  for finite argument, only __ieee754_expf_sse2(0)=1 is exact
+ *  __ieee754_expf_sse2(x) overflows if x>700
+ *  __ieee754_expf_sse2(x) underflows if x<-700
+ *
+ * Note:
+ *  For |x|<700, __ieee754_expf_sse2 computes result in double precision,
+ *  with accuracy a bit more than needed for expf, and does not round it
+ *  to single precision.
+ */
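
Restated in C, the reduction is a table lookup plus a short polynomial.
The sketch below uses Taylor-style coefficients for P (the file's
DP_P0..DP_P3 are minimax-tuned values) and an explicit nearbyint where
the assembly folds the rounding step into the DP_RS shifter constant:

  #include <math.h>

  #define K 64
  static double T[K];                   /* T[j] = 2^(j/K), cf. DP_T */

  static void
  init_table (void)                     /* call once before use */
  {
    for (int j = 0; j < K; j++)
      T[j] = exp2 (j / 64.0);
  }

  static float
  expf_sketch (float x)
  {
    double t = nearbyint ((double) x * K / M_LN2);  /* t = m = n*K + j */
    int m = (int) t;
    int j = m & (K - 1);
    int n = (m - j) / K;                /* floor (m / K), exact */
    double y = (double) x - t * (M_LN2 / K);
    double z = y * y;
    /* P(y) ~ e^y - 1; Taylor coefficients 1, 1/2, 1/6, 1/24 here.  */
    double P = (z / 24.0 + 0.5) * z + (z / 6.0 + 1.0) * y;
    return (float) ldexp (T[j] * (1.0 + P), n);     /* 2^n * T[j] * e^y */
  }
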
+
+
+#ifdef	PIC
+# define MO1(symbol)			L(symbol)##@GOTOFF(%edx)
+# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%edx,reg2,_scale)
+#else
+# define MO1(symbol)			L(symbol)
+# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
+#endif
+
+	.text
+ENTRY(__ieee754_expf_sse2)
+	/* Input: single precision x on stack at address 4(%esp) */
+
+#ifdef	PIC
+	LOAD_PIC_REG(dx)
+#endif
+
+	cvtss2sd	4(%esp), %xmm1	/* Convert x to double precision */
+	mov	4(%esp), %ecx		/* Copy x */
+	movsd	MO1(DP_KLN2), %xmm2	/* DP K/log(2) */
+	movsd	MO1(DP_P2), %xmm3	/* DP P2 */
+	movl	%ecx, %eax		/* x */
+	mulsd	%xmm1, %xmm2		/* DP x*K/log(2) */
+	andl	$0x7fffffff, %ecx	/* |x| */
+	cmpl	$0x442f0000, %ecx	/* |x|<700 ? */
+	movsd	MO1(DP_P3), %xmm4	/* DP P3 */
+	addsd	MO1(DP_RS), %xmm2	/* DP x*K/log(2)+RS */
+	jae	L(special_paths)
+
+	/* Here if |x|<700 */
+	cmpl	$0x31800000, %ecx	/* |x|<2^(-28) ? */
+	jb	L(small_arg)
+
+	/* Main path: here if 2^(-28)<=|x|<700 */
+	cvtsd2ss	%xmm2, %xmm2	/* SP x*K/log(2)+RS */
+	movd	%xmm2, %eax		/* bits of n*K+j with trash */
+	subss	MO1(SP_RS), %xmm2	/* SP t=round(x*K/log(2)) */
+	movl	%eax, %ecx		/* n*K+j with trash */
+	cvtss2sd	%xmm2, %xmm2	/* DP t */
+	andl	$0x3f, %eax		/* bits of j */
+	mulsd	MO1(DP_NLN2K), %xmm2	/* DP -t*log(2)/K */
+	andl	$0xffffffc0, %ecx	/* bits of n */
+#ifdef __AVX__
+	vaddsd	%xmm1, %xmm2, %xmm0	/* DP y=x-t*log(2)/K */
+	vmulsd	%xmm0, %xmm0, %xmm2	/* DP z=y*y */
+#else
+	addsd	%xmm1, %xmm2		/* DP y=x-t*log(2)/K */
+	movaps	%xmm2, %xmm0		/* DP y */
+	mulsd	%xmm2, %xmm2		/* DP z=y*y */
+#endif
+	mulsd	%xmm2, %xmm4		/* DP P3*z */
+	addl	$0xffc0, %ecx		/* bits of n + DP exponent bias */
+	mulsd	%xmm2, %xmm3		/* DP P2*z */
+	shrl	$2, %ecx		/* High 2 bytes of DP 2^n */
+	pxor	%xmm1, %xmm1		/* clear %xmm1 */
+	addsd	MO1(DP_P1), %xmm4	/* DP P3*z+P1 */
+	addsd	MO1(DP_P0), %xmm3	/* DP P2*z+P0 */
+	pinsrw	$3, %ecx, %xmm1		/* DP 2^n */
+	mulsd	%xmm2, %xmm4		/* DP (P3*z+P1)*z */
+	mulsd	%xmm3, %xmm0		/* DP (P2*z+P0)*y */
+	addsd	%xmm4, %xmm0		/* DP P(y) */
+	mulsd	MO2(DP_T,%eax,8), %xmm0	/* DP P(y)*T[j] */
+	addsd	MO2(DP_T,%eax,8), %xmm0	/* DP T[j]*(P(y)+1) */
+	mulsd	%xmm1, %xmm0		/* DP result=2^n*(T[j]*(P(y)+1)) */
+	cvtsd2ss	%xmm0, %xmm1
+
+	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
+	movss	%xmm1, 0(%esp)		/* Move result from sse... */
+	flds	0(%esp)			/* ...to FPU. */
+	lea	4(%esp), %esp		/* Give back the borrowed 4 bytes */
+	ret
+
+	.p2align	4
+L(small_arg):
+	/* Here if 0<=|x|<2^(-28) */
+	movss	4(%esp), %xmm0		/* load x */
+	addss	MO1(SP_ONE), %xmm0	/* 1.0 + x */
+	/* Return 1.0 with inexact raised, except for x==0 */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(special_paths):
+	/* Here if x is NaN, or Inf, or finite |x|>=700 */
+	movss	4(%esp), %xmm0		/* load x */
+
+	cmpl	$0x7f800000, %ecx	/* |x| is finite ? */
+	jae	L(arg_inf_or_nan)
+
+	/* Here if finite |x|>=700 */
+	testl	$0x80000000, %eax	/* sign of x nonzero ? */
+	je	L(res_overflow)
+
+	/* Here if finite x<=-700 */
+	movss	MO1(SP_SMALL), %xmm0	/* load small value 2^(-100) */
+	mulss	%xmm0, %xmm0		/* Return underflowed result (zero or subnormal) */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(res_overflow):
+	/* Here if finite x>=700 */
+	movss	MO1(SP_LARGE), %xmm0	/* load large value 2^100 */
+	mulss	%xmm0, %xmm0		/* Return overflowed result (Inf or max normal) */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_inf_or_nan):
+	/* Here if |x| is Inf or NAN */
+	jne	L(arg_nan)	/* |x| is Inf ? */
+
+	/* Here if |x| is Inf */
+	shrl	$31, %eax		/* Get sign bit of x */
+	movss	MO2(SP_INF_0,%eax,4), %xmm0 /* return zero or Inf, depending on sign of x */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_nan):
+	/* Here if |x| is NaN */
+	addss	%xmm0, %xmm0		/* Return x+x (raise invalid) */
+
+	.p2align	4
+L(epilogue):
+	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
+	movss	%xmm0, 0(%esp)		/* Move result from sse... */
+	flds	0(%esp)			/* ...to FPU. */
+	lea	4(%esp), %esp		/* Return back 4 bytes of stack frame */
+	ret
+END(__ieee754_expf_sse2)
+
+	.section .rodata, "a"
+	.p2align 3
+L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */
+	.long	0x00000000, 0x3ff00000
+	.long	0x3e778061, 0x3ff02c9a
+	.long	0xd3158574, 0x3ff059b0
+	.long	0x18759bc8, 0x3ff08745
+	.long	0x6cf9890f, 0x3ff0b558
+	.long	0x32d3d1a2, 0x3ff0e3ec
+	.long	0xd0125b51, 0x3ff11301
+	.long	0xaea92de0, 0x3ff1429a
+	.long	0x3c7d517b, 0x3ff172b8
+	.long	0xeb6fcb75, 0x3ff1a35b
+	.long	0x3168b9aa, 0x3ff1d487
+	.long	0x88628cd6, 0x3ff2063b
+	.long	0x6e756238, 0x3ff2387a
+	.long	0x65e27cdd, 0x3ff26b45
+	.long	0xf51fdee1, 0x3ff29e9d
+	.long	0xa6e4030b, 0x3ff2d285
+	.long	0x0a31b715, 0x3ff306fe
+	.long	0xb26416ff, 0x3ff33c08
+	.long	0x373aa9cb, 0x3ff371a7
+	.long	0x34e59ff7, 0x3ff3a7db
+	.long	0x4c123422, 0x3ff3dea6
+	.long	0x21f72e2a, 0x3ff4160a
+	.long	0x6061892d, 0x3ff44e08
+	.long	0xb5c13cd0, 0x3ff486a2
+	.long	0xd5362a27, 0x3ff4bfda
+	.long	0x769d2ca7, 0x3ff4f9b2
+	.long	0x569d4f82, 0x3ff5342b
+	.long	0x36b527da, 0x3ff56f47
+	.long	0xdd485429, 0x3ff5ab07
+	.long	0x15ad2148, 0x3ff5e76f
+	.long	0xb03a5585, 0x3ff6247e
+	.long	0x82552225, 0x3ff66238
+	.long	0x667f3bcd, 0x3ff6a09e
+	.long	0x3c651a2f, 0x3ff6dfb2
+	.long	0xe8ec5f74, 0x3ff71f75
+	.long	0x564267c9, 0x3ff75feb
+	.long	0x73eb0187, 0x3ff7a114
+	.long	0x36cf4e62, 0x3ff7e2f3
+	.long	0x994cce13, 0x3ff82589
+	.long	0x9b4492ed, 0x3ff868d9
+	.long	0x422aa0db, 0x3ff8ace5
+	.long	0x99157736, 0x3ff8f1ae
+	.long	0xb0cdc5e5, 0x3ff93737
+	.long	0x9fde4e50, 0x3ff97d82
+	.long	0x82a3f090, 0x3ff9c491
+	.long	0x7b5de565, 0x3ffa0c66
+	.long	0xb23e255d, 0x3ffa5503
+	.long	0x5579fdbf, 0x3ffa9e6b
+	.long	0x995ad3ad, 0x3ffae89f
+	.long	0xb84f15fb, 0x3ffb33a2
+	.long	0xf2fb5e47, 0x3ffb7f76
+	.long	0x904bc1d2, 0x3ffbcc1e
+	.long	0xdd85529c, 0x3ffc199b
+	.long	0x2e57d14b, 0x3ffc67f1
+	.long	0xdcef9069, 0x3ffcb720
+	.long	0x4a07897c, 0x3ffd072d
+	.long	0xdcfba487, 0x3ffd5818
+	.long	0x03db3285, 0x3ffda9e6
+	.long	0x337b9b5f, 0x3ffdfc97
+	.long	0xe78b3ff6, 0x3ffe502e
+	.long	0xa2a490da, 0x3ffea4af
+	.long	0xee615a27, 0x3ffefa1b
+	.long	0x5b6e4540, 0x3fff5076
+	.long	0x819e90d8, 0x3fffa7c1
+	.type L(DP_T), @object
+	ASM_SIZE_DIRECTIVE(L(DP_T))
+
+	.section .rodata.cst8,"aM",@progbits,8
+	.p2align 3
+L(DP_KLN2): /* double precision K/log(2) */
+	.long	0x652b82fe, 0x40571547
+	.type L(DP_KLN2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_KLN2))
+
+	.p2align 3
+L(DP_NLN2K): /* double precision -log(2)/K */
+	.long	0xfefa39ef, 0xbf862e42
+	.type L(DP_NLN2K), @object
+	ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
+
+	.p2align 3
+L(DP_RS): /* double precision 2^23+2^22 */
+	.long	0x00000000, 0x41680000
+	.type L(DP_RS), @object
+	ASM_SIZE_DIRECTIVE(L(DP_RS))
+
+	.p2align 3
+L(DP_P3): /* double precision polynomial coefficient P3 */
+	.long	0xeb78fa85, 0x3fa56420
+	.type L(DP_P3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_P3))
+
+	.p2align 3
+L(DP_P1): /* double precision polynomial coefficient P1 */
+	.long	0x008d6118, 0x3fe00000
+	.type L(DP_P1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_P1))
+
+	.p2align 3
+L(DP_P2): /* double precision polynomial coefficient P2 */
+	.long	0xda752d4f, 0x3fc55550
+	.type L(DP_P2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_P2))
+
+	.p2align 3
+L(DP_P0): /* double precision polynomial coefficient P0 */
+	.long	0xffffe7c6, 0x3fefffff
+	.type L(DP_P0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_P0))
+
+	.p2align 2
+L(SP_INF_0):
+	.long	0x7f800000	/* single precision Inf */
+	.long	0		/* single precision zero */
+	.type L(SP_INF_0), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INF_0))
+
+	.section .rodata.cst4,"aM",@progbits,4
+	.p2align 2
+L(SP_RS): /* single precision 2^23+2^22 */
+	.long	0x4b400000
+	.type L(SP_RS), @object
+	ASM_SIZE_DIRECTIVE(L(SP_RS))
+
+	.p2align 2
+L(SP_SMALL): /* single precision small value 2^(-100) */
+	.long	0x0d800000
+	.type L(SP_SMALL), @object
+	ASM_SIZE_DIRECTIVE(L(SP_SMALL))
+
+	.p2align 2
+L(SP_LARGE): /* single precision large value 2^100 */
+	.long	0x71800000
+	.type L(SP_LARGE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_LARGE))
+
+	.p2align 2
+L(SP_ONE): /* single precision 1.0 */
+	.long	0x3f800000
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+strong_alias (__ieee754_expf_sse2, __expf_finite_sse2)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c
new file mode 100644
index 0000000000..388cf98a39
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c
@@ -0,0 +1,37 @@
+/* Multiple versions of expf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern double __ieee754_expf_sse2 (double);
+extern double __ieee754_expf_ia32 (double);
+
+double __ieee754_expf (double);
+libm_ifunc (__ieee754_expf,
+	    HAS_CPU_FEATURE (SSE2)
+	    ? __ieee754_expf_sse2
+	    : __ieee754_expf_ia32);
+
+extern double __expf_finite_sse2 (double);
+extern double __expf_finite_ia32 (double);
+
+double __expf_finite (double);
+libm_ifunc (__expf_finite,
+	    HAS_CPU_FEATURE (SSE2)
+	    ? __expf_finite_sse2
+	    : __expf_finite_ia32);
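
libm_ifunc expands to a GNU indirect function (IFUNC): the dynamic linker
runs the resolver once at relocation time and binds the exported symbol to
whichever implementation the resolver returns.  A generic GCC sketch of
the same dispatch (cpu_has_sse2 is a hypothetical stand-in for glibc's
HAS_CPU_FEATURE (SSE2)):

  extern double __ieee754_expf_sse2 (double);
  extern double __ieee754_expf_ia32 (double);
  extern int cpu_has_sse2 (void);       /* hypothetical CPUID probe */

  /* Resolver: runs once, at load time, before the symbol is used.  */
  static double (*resolve_expf (void)) (double)
  {
    return cpu_has_sse2 () ? __ieee754_expf_sse2 : __ieee754_expf_ia32;
  }

  double __ieee754_expf (double)
    __attribute__ ((ifunc ("resolve_expf")));
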
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
new file mode 100644
index 0000000000..04bc23b37b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -0,0 +1,2188 @@
+# Begin of automatic generation
+
+# Maximal error of functions:
+Function: "acos":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "acos_downward":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_towardzero":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "acosh":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 2
+
+Function: "acosh_downward":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_towardzero":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 3
+
+Function: "asin":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "asin_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asinh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "asinh_downward":
+double: 1
+float: 1
+idouble: 1
+ildouble: 5
+ldouble: 5
+
+Function: "asinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "asinh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "atan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "atanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "atanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 3
+
+Function: "atanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "cabs":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacos_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "cacos_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "cacos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacosh_downward":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cacosh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacosh_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "carg":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "casin_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "casin_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "casin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casin_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casinh_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Imaginary part of "casinh_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "casinh_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "casinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "catan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "catanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catanh":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cbrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "cbrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccosh_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cexp":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cexp":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cexp_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cexp_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "clog":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog10":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog10":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "clog10_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "clog10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos":
+ildouble: 1
+ldouble: 1
+
+Function: "cos_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cos_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh":
+double: 1
+float: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: "cosh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: Real part of "cpow":
+double: 2
+float: 5
+idouble: 2
+ifloat: 5
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cpow":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "cpow_downward":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cpow_towardzero":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cpow_upward":
+double: 4
+float: 1
+idouble: 4
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cpow_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+
+Function: Real part of "csin_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "csinh_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csqrt":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csqrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ctan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctan_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ctanh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctanh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "erf":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erfc":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "erfc_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "exp":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_downward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_upward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "expm1":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "expm1_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "gamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "gamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "hypot":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "j0_downward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j0_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "j0_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j1":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j1_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "j1_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: "jn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "jn_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "lgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "lgamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "log":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log1p":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log1p_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "log2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow_downward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_towardzero":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "sin":
+ildouble: 1
+ldouble: 1
+
+Function: "sin_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sin_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sin_upward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos":
+ildouble: 1
+ldouble: 1
+
+Function: "sincos_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sincos_upward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sinh":
+double: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sinh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "sinh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "sinh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "tan":
+float: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "tan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "tanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 7
+ldouble: 4
+
+Function: "tanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 4
+
+Function: "tgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_downward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "y0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "y0_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "y1":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "y1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "y1_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y1_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: "yn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "yn_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "yn_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "yn_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+# end of automatic generation
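
Each entry above records, per function and floating type, the maximum error in
units in the last place (ulp) expected of these multiarch implementations; the
i-prefixed rows (ifloat, idouble, ildouble) correspond to the inline-function
variants of the same tests.  For illustration only, a hedged C sketch of what
a ulp distance between two floats means (this is not the harness glibc itself
uses):

#include <math.h>
#include <stdio.h>

/* Count the representable floats separating two values by stepping
   with nextafterf.  Illustrative only; assumes neither value is NaN.  */
static int
ulp_distance (float got, float expected)
{
  int n = 0;
  while (got != expected)
    {
      expected = nextafterf (expected, got);
      ++n;
    }
  return n;
}

int
main (void)
{
  /* Compare single-precision sinf against a double reference rounded
     to float; compile with -lm.  */
  printf ("%d ulp\n", ulp_distance (sinf (0.5f), (float) sin (0.5)));
  return 0;
}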
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name
new file mode 100644
index 0000000000..193dd704b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name
@@ -0,0 +1 @@
+i686
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S
new file mode 100644
index 0000000000..f37850d0b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S
@@ -0,0 +1,553 @@
+/* SSE2-optimized version of cosf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x| == 0: return 1.0-|x|.
+ *  2) if |x| <  2^-27: return 1.0-|x|.
+ *  3) if |x| <  2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1.
+ *  4) if |x| <   Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
+ *           t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          s = (-1.0)^((n>>2)&1)
+ *          if(n&2 != 0) {
+ *              using cos(t) polynomial for |t|<Pi/4, result is
+ *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ *          } else {
+ *              using sin(t) polynomial for |t|<Pi/4, result is
+ *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
+ *           t=|x|-j*Pi/4.
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
+ *           t=|x|-j*Pi/4.
+ *      7.2) Reconstruction same as (5.2).
+ *  8) if x is Inf, return x-x, and set errno=EDOM.
+ *  9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ *  cos(+-0) = 1 not raising inexact,
+ *  cos(subnormal) raises inexact,
+ *  cos(min_normalized) raises inexact,
+ *  cos(normalized) raises inexact,
+ *  cos(Inf) = NaN, raises invalid, sets errno to EDOM,
+ *  cos(NaN) = NaN.
+ */
+
+#ifdef	PIC
+# define MO1(symbol)			L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG)	cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG)	cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG)			pushl REG; CFI_PUSH(REG)
+# define POP(REG)			popl REG; CFI_POP(REG)
+# define ENTRANCE			PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN				POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X				8(%esp)
+#else
+# define MO1(symbol)			L(symbol)
+# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN				ret
+# define ARG_X				4(%esp)
+#endif
+
+	.text
+ENTRY(__cosf_sse2)
+	/* Input: single precision x on stack at address ARG_X */
+
+	ENTRANCE
+	movl	ARG_X, %eax		/* Bits of x */
+	cvtss2sd ARG_X, %xmm0		/* DP x */
+	andl	$0x7fffffff, %eax	/* |x| */
+
+	cmpl	$0x3f490fdb, %eax	/* |x|<Pi/4?  */
+	jb	L(arg_less_pio4)
+
+	/* Here if |x|>=Pi/4 */
+	movd	%eax, %xmm3		/* SP |x| */
+	andpd	MO1(DP_ABS_MASK),%xmm0	/* DP |x| */
+	movss	MO1(SP_INVPIO4), %xmm2	/* SP 1/(Pi/4) */
+
+	cmpl	$0x40e231d6, %eax	/* |x|<9*Pi/4?  */
+	jae	L(large_args)
+
+	/* Here if Pi/4<=|x|<9*Pi/4 */
+	mulss	%xmm3, %xmm2		/* SP |x|/(Pi/4) */
+	cvttss2si %xmm2, %eax		/* k, number of Pi/4 in x */
+	addl	$1, %eax		/* k+1 */
+	movl	$0x0e, %edx
+	andl	%eax, %edx		/* j = (k+1)&0x0e */
+	addl	$2, %eax		/* n */
+	subsd	MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+	/* Input: %eax=n, %xmm0=t */
+	testl	$2, %eax		/* n&2 != 0?  */
+	jz	L(sin_poly)
+
+/*L(cos_poly):*/
+	/* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = sign(x) * (-1.0)^((n>>2)&1)
+	 * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
+	 */
+	shrl	$2, %eax		/* n>>2 */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	andl	$1, %eax		/* (n>>2)&1 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=t^4 */
+
+	movsd	MO1(DP_C4), %xmm4	/* C4 */
+	mulsd	%xmm0, %xmm4		/* z*C4 */
+	movsd	MO1(DP_C3), %xmm3	/* C3 */
+	mulsd	%xmm0, %xmm3		/* z*C3 */
+	addsd	MO1(DP_C2), %xmm4	/* C2+z*C4 */
+	mulsd	%xmm0, %xmm4		/* z*(C2+z*C4) */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+	addsd	MO1(DP_C1), %xmm3	/* C1+z*C3 */
+	mulsd	%xmm0, %xmm3		/* z*(C1+z*C3) */
+	addsd	MO1(DP_C0), %xmm4	/* C0+z*(C2+z*C4) */
+	mulsd	%xmm1, %xmm4		/* y*(C0+z*(C2+z*C4)) */
+
+	addsd	%xmm4, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	/* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	addsd	MO1(DP_ONES), %xmm3
+
+	mulsd	MO2(DP_ONES,%eax,8), %xmm3 /* DP result */
+	movsd	%xmm3, 0(%esp)		/* Move result from sse...  */
+	fldl	0(%esp)			/* ...to FPU.  */
+	/* Return back 8 bytes of stack frame */
+	lea	8(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(sin_poly):
+	/* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = sign(x) * (-1.0)^((n>>2)&1)
+	 * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
+	 */
+
+	movaps	%xmm0, %xmm4		/* t */
+	shrl	$2, %eax		/* n>>2 */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	andl	$1, %eax		/* (n>>2)&1 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=t^4 */
+
+	movsd	MO1(DP_S4), %xmm2	/* S4 */
+	mulsd	%xmm0, %xmm2		/* z*S4 */
+	movsd	MO1(DP_S3), %xmm3	/* S3 */
+	mulsd	%xmm0, %xmm3		/* z*S3 */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+	addsd	MO1(DP_S2), %xmm2	/* S2+z*S4 */
+	mulsd	%xmm0, %xmm2		/* z*(S2+z*S4) */
+	addsd	MO1(DP_S1), %xmm3	/* S1+z*S3 */
+	mulsd	%xmm0, %xmm3		/* z*(S1+z*S3) */
+	addsd	MO1(DP_S0), %xmm2	/* S0+z*(S2+z*S4) */
+	mulsd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
+	/* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */
+	mulsd	MO2(DP_ONES,%eax,8), %xmm4
+	addsd	%xmm2, %xmm3		/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	/* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	mulsd	%xmm4, %xmm3
+	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
+	addsd	%xmm4, %xmm3
+	movsd	%xmm3, 0(%esp)		/* Move result from sse...   */
+	fldl	0(%esp)			/* ...to FPU.  */
+	/* Return back 8 bytes of stack frame */
+	lea	8(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(large_args):
+	/* Here if |x|>=9*Pi/4 */
+	cmpl	$0x7f800000, %eax	/* x is Inf or NaN?  */
+	jae	L(arg_inf_or_nan)
+
+	/* Here if finite |x|>=9*Pi/4 */
+	cmpl	$0x4b000000, %eax	/* |x|<2^23?  */
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	MO1(DP_INVPIO4), %xmm1	/* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax		/* k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4		/* DP j */
+	movsd	MO1(DP_PIO4HI), %xmm2	/* -PIO4HI = high part of -Pi/4 */
+	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
+	movsd	MO1(DP_PIO4LO), %xmm3	/* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
+	addl	$2, %eax		/* n */
+	mulsd	%xmm3, %xmm4		/* j*PIO4LO */
+	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align	4
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax		/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+	subl	$68, %eax
+	movl	$28, %ecx		/* %cl=28 */
+	movl	%eax, %edx		/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl			/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3		/* |x| */
+	/* clear unneeded remainder from %ah */
+	andl	$0xff, %eax
+
+	imull	$28, %eax, %ecx		/* j*28 */
+	movsd	MO1(DP_HI_MASK), %xmm4	/* DP_HI_MASK */
+	movapd	%xmm0, %xmm5		/* |x| */
+	mulsd	-2*8+MO2(_FPI,%eax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1		/* |x| */
+	mulsd	-1*8+MO2(_FPI,%eax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
+	mulsd	0*8+MO2(_FPI,%eax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx		/* j*28+19 */
+	mulsd	1*8+MO2(_FPI,%eax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx		/* bitpos>=j*28+19?  */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4		/* HI(tmp3) */
+	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	MO1(DP_2POW52), %xmm6
+	movapd	%xmm5, %xmm2		/* tmp2 copy */
+	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
+	movsd	8+MO1(DP_2POW52), %xmm4
+	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
+	comisd	%xmm5, %xmm4		/* tmp4 > tmp5?  */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax		/* k-- */
+	addsd	8+MO1(DP_ONES), %xmm4	/* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx		/* k&1 */
+	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
+	addsd	MO2(DP_ZERONE,%edx,8), %xmm3 /* t  = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3		/* t += tmp2 */
+	addsd	%xmm3, %xmm0		/* t += tmp0 */
+	addl	$3, %eax		/* n=k+3 */
+	addsd	%xmm1, %xmm0		/* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= PI04 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align	4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5?  */
+	jl	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4 */
+	mulsd	%xmm0, %xmm0		/* y=x^2 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=x^4 */
+	movsd	MO1(DP_C4), %xmm3	/* C4 */
+	mulsd	%xmm0, %xmm3		/* z*C4 */
+	movsd	MO1(DP_C3), %xmm5	/* C3 */
+	mulsd	%xmm0, %xmm5		/* z*C3 */
+	addsd	MO1(DP_C2), %xmm3	/* C2+z*C4 */
+	mulsd	%xmm0, %xmm3		/* z*(C2+z*C4) */
+	addsd	MO1(DP_C1), %xmm5	/* C1+z*C3 */
+	mulsd	%xmm0, %xmm5		/* z*(C1+z*C3) */
+	addsd	MO1(DP_C0), %xmm3	/* C0+z*(C2+z*C4) */
+	mulsd	%xmm1, %xmm3		/* y*(C0+z*(C2+z*C4)) */
+	addsd	%xmm5, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	/* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	addsd	MO1(DP_ONES), %xmm3
+	cvtsd2ss %xmm3, %xmm3		/* SP result */
+
+L(epilogue):
+	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
+	movss	%xmm3, 0(%esp)		/* Move result from sse...  */
+	flds	0(%esp)			/* ...to FPU.  */
+	/* Return back 4 bytes of stack frame */
+	lea	4(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5 */
+	cmpl	$0x32000000, %eax	/* |x|<2^-27?  */
+	jl	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5 */
+	mulsd	%xmm0, %xmm0		/* DP x^2 */
+	movsd	MO1(DP_COS2_1), %xmm3	/* DP DP_COS2_1 */
+	mulsd	%xmm0, %xmm3		/* DP x^2*DP_COS2_1 */
+	addsd	MO1(DP_COS2_0), %xmm3	/* DP DP_COS2_0+x^2*DP_COS2_1 */
+	mulsd	%xmm0, %xmm3		/* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */
+	/* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */
+	addsd	MO1(DP_ONES), %xmm3
+	cvtsd2ss %xmm3, %xmm3		/* SP result */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_less_2pn27):
+	/* Here if |x|<2^-27 */
+	movss	ARG_X, %xmm0		/* x */
+	andps	MO1(SP_ABS_MASK),%xmm0	/* |x| */
+	movss	MO1(SP_ONE), %xmm3	/* 1.0 */
+	subss	%xmm0, %xmm3		/* result is 1.0-|x| */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_inf_or_nan):
+	/* Here if |x| is Inf or NAN */
+	jne	L(skip_errno_setting)	/* in case x is NaN */
+
+	/* Here if x is Inf. Set errno to EDOM.  */
+	call	JUMPTARGET(__errno_location)
+	movl	$EDOM, (%eax)
+
+	.p2align	4
+L(skip_errno_setting):
+	/* Here if |x| is Inf or NAN. Continued.  */
+	movss	ARG_X, %xmm3		/* load x */
+	subss	%xmm3, %xmm3		/* Result is NaN */
+	jmp	L(epilogue)
+END(__cosf_sse2)
+
+	.section .rodata, "a"
+	.p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+	.long	0x00000000,0x00000000
+	.long	0x54442d18,0x3fe921fb
+	.long	0x54442d18,0x3ff921fb
+	.long	0x7f3321d2,0x4002d97c
+	.long	0x54442d18,0x400921fb
+	.long	0x2955385e,0x400f6a7a
+	.long	0x7f3321d2,0x4012d97c
+	.long	0xe9bba775,0x4015fdbb
+	.long	0x54442d18,0x401921fb
+	.long	0xbeccb2bb,0x401c463a
+	.long	0x2955385e,0x401f6a7a
+	.type L(PIO4J), @object
+	ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+	.p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+	.long	0x00000000,0x00000000
+	.long	0x6c000000,0x3ff45f30
+	.long	0x2a000000,0x3e3c9c88
+	.long	0xa8000000,0x3c54fe13
+	.long	0xd0000000,0x3aaf47d4
+	.long	0x6c000000,0x38fbb81b
+	.long	0xe0000000,0x3714acc9
+	.long	0x7c000000,0x3560e410
+	.long	0x56000000,0x33bca2c7
+	.long	0xac000000,0x31fbd778
+	.long	0xe0000000,0x300b7246
+	.long	0xe8000000,0x2e5d2126
+	.long	0x48000000,0x2c970032
+	.long	0xe8000000,0x2ad77504
+	.long	0xe0000000,0x290921cf
+	.long	0xb0000000,0x274deb1c
+	.long	0xe0000000,0x25829a73
+	.long	0xbe000000,0x23fd1046
+	.long	0x10000000,0x2224baed
+	.long	0x8e000000,0x20709d33
+	.long	0x80000000,0x1e535a2f
+	.long	0x64000000,0x1cef904e
+	.long	0x30000000,0x1b0d6398
+	.long	0x24000000,0x1964ce7d
+	.long	0x16000000,0x17b908bf
+	.type L(_FPI), @object
+	ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+ for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5.  */
+	.p2align 3
+L(DP_COS2_0):
+	.long	0xff5cc6fd,0xbfdfffff
+	.type L(DP_COS2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
+
+	.p2align 3
+L(DP_COS2_1):
+	.long	0xb178dac5,0x3fa55514
+	.type L(DP_COS2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE),@object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_S3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.type L(DP_S3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+	.p2align 3
+L(DP_S1):
+	.long	0x10c2688b,0x3f811111
+	.type L(DP_S1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+	.p2align 3
+L(DP_S4):
+	.long	0x1674b58a,0xbe5a947e
+	.type L(DP_S4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+	.p2align 3
+L(DP_S2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.type L(DP_S2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+	.p2align 3
+L(DP_S0):
+	.long	0x55551cd9,0xbfc55555
+	.type L(DP_S0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_C3):
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_C3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+	.p2align 3
+L(DP_C1):
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_C1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+	.p2align 3
+L(DP_C4):
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_C4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+	.p2align 3
+L(DP_C2):
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_C2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+	.p2align 3
+L(DP_C0):
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_C0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of Pi/4 */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* Low part of Pi/4 */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983		/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+	.p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+	.long	0x7fffffff,0x7fffffff
+	.long	0x7fffffff,0x7fffffff
+	.type L(SP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+	.p2align 2
+L(SP_ONE):
+	.long	0x3f800000		/* 1.0 */
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias (__cosf, cosf)
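
The octant bookkeeping described in steps 5.1/5.2 at the top of this file is
easier to follow in C.  A minimal sketch for Pi/4 <= |x| < 9*Pi/4, where
libm's own sin and cos stand in for the C0..C4 and S0..S4 polynomials
(cosf_model and pio4 are illustrative names, not symbols from this file):

#include <math.h>
#include <stdio.h>

static const double pio4 = 0x1.921fb54442d18p-1;	/* Pi/4 */

static double
cosf_model (double ax)			/* ax = |x|; cos is even */
{
  int k = (int) (ax / pio4);		/* octants in |x| */
  int j = (k + 1) & 0x0e;		/* k+1 with low bit cleared */
  int n = k + 3;
  double t = ax - j * pio4;		/* reduced arg, |t| <= Pi/4 */
  double s = ((n >> 2) & 1) ? -1.0 : 1.0;
  /* n&2 selects the cos(t) polynomial, else s*t*sin-poly(t).  */
  return (n & 2) ? s * cos (t) : s * sin (t);
}

int
main (void)
{
  printf ("%f vs %f\n", cosf_model (2.0), cos (2.0));
  return 0;
}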
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
new file mode 100644
index 0000000000..af588de9dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
@@ -0,0 +1,29 @@
+/* Multiple versions of cosf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern float __cosf_sse2 (float);
+extern float __cosf_ia32 (float);
+float __cosf (float);
+
+libm_ifunc (__cosf, HAS_CPU_FEATURE (SSE2) ? __cosf_sse2 : __cosf_ia32);
+weak_alias (__cosf, cosf);
+
+#define COSF __cosf_ia32
+#include <sysdeps/ieee754/flt-32/s_cosf.c>
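
libm_ifunc above arranges for the SSE2/ia32 choice to be made once, at
symbol-resolution time, through an STT_GNU_IFUNC resolver.  A hedged, portable
approximation of the same effect with a lazily initialized function pointer
(my_cosf is an illustrative name, and GCC's __builtin_cpu_supports stands in
for HAS_CPU_FEATURE; this is not glibc's actual macro expansion):

#include <stddef.h>

extern float __cosf_sse2 (float);
extern float __cosf_ia32 (float);

static float (*cosf_impl) (float);

float
my_cosf (float x)
{
  if (cosf_impl == NULL)	/* dispatch decided on first call */
    cosf_impl = __builtin_cpu_supports ("sse2")
		? __cosf_sse2 : __cosf_ia32;
  return cosf_impl (x);
}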
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S
new file mode 100644
index 0000000000..f31a925522
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S
@@ -0,0 +1,586 @@
+/* SSE2-optimized version of sincosf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x|==0:    sin(x)=x,
+ *                   cos(x)=1.
+ *  2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed,
+ *                   cos(x)=1-|x|.
+ *  3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1,
+ *                   cos(x)=1+1*x^2*DP_COS2_0+x^4*DP_COS2_1.
+ *  4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
+ *                   cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          sign_sin = sign(x) * (-1.0)^(( n   >>2)&1)
+ *          sign_cos =           (-1.0)^(((n+2)>>2)&1)
+ *          poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t
+ *          poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*s+s
+ *          if(n&2 != 0) {
+ *              using cos(t) and sin(t) polynomials for |t|<Pi/4, results are
+ *              cos(x) = poly_sin * sign_cos
+ *              sin(x) = poly_cos * sign_sin
+ *          } else {
+ *              sin(x) = poly_sin * sign_sin
+ *              cos(x) = poly_cos * sign_cos
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4.
+ *      7.2) Reconstruction same as (5.2).
+ *  8) if x is Inf, return x-x, and set errno=EDOM.
+ *  9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ *  sin/cos(+-0) = +-0/1 not raising inexact/underflow,
+ *  sin/cos(subnormal) raises inexact/underflow,
+ *  sin/cos(min_normalized) raises inexact/underflow,
+ *  sin/cos(normalized) raises inexact,
+ *  sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM,
+ *  sin/cos(NaN) = NaN.
+ */
+
+#ifdef	PIC
+# define MO1(symbol)			L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG)	cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG)	cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG)			pushl REG; CFI_PUSH(REG)
+# define POP(REG)			popl REG; CFI_POP(REG)
+# define ENTRANCE			PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN				POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X				8(%esp)
+# define ARG_SIN_PTR			12(%esp)
+# define ARG_COS_PTR			16(%esp)
+#else
+# define MO1(symbol)			L(symbol)
+# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN				ret
+# define ARG_X				4(%esp)
+# define ARG_SIN_PTR			8(%esp)
+# define ARG_COS_PTR			12(%esp)
+#endif
+
+	.text
+ENTRY(__sincosf_sse2)
+	/* Input: single precision x on stack at address ARG_X */
+	/*        pointer to sin result on stack at address ARG_SIN_PTR */
+	/*        pointer to cos result on stack at address ARG_COS_PTR */
+
+	ENTRANCE
+	movl	ARG_X, %eax		/* Bits of x */
+	cvtss2sd ARG_X, %xmm0		/* DP x */
+	andl	$0x7fffffff, %eax	/* |x| */
+
+	cmpl	$0x3f490fdb, %eax	/* |x|<Pi/4 ? */
+	jb	L(arg_less_pio4)
+
+	/* Here if |x|>=Pi/4 */
+	movd	%eax, %xmm3		/* SP |x| */
+	andpd	MO1(DP_ABS_MASK),%xmm0	/* DP |x| */
+	movss	MO1(SP_INVPIO4), %xmm2	/* SP 1/(Pi/4) */
+
+	cmpl	$0x40e231d6, %eax	/* |x|<9*Pi/4 ? */
+	jae	L(large_args)
+
+	/* Here if Pi/4<=|x|<9*Pi/4 */
+	mulss	%xmm3, %xmm2		/* SP |x|/(Pi/4) */
+	movl	ARG_X, %ecx		/* Load x */
+	cvttss2si %xmm2, %eax		/* k, number of Pi/4 in x */
+	shrl	$29, %ecx		/* (sign of x) << 2 */
+	addl	$1, %eax		/* k+1 */
+	movl	$0x0e, %edx
+	andl	%eax, %edx		/* j = (k+1)&0x0e */
+	subsd	MO2(PIO4J,%edx,8), %xmm0/* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+	/* Input: %eax=n, %xmm0=t, %ecx=sign(x) */
+
+	movaps	%xmm0, %xmm4		/* t */
+	movhpd	MO1(DP_ONES), %xmm4	/* 1|t */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	movl	$2, %edx
+	unpcklpd %xmm0, %xmm0		/* y|y */
+	addl	%eax, %edx		/* k+2 */
+	movaps	%xmm0, %xmm1		/* y|y */
+	mulpd	%xmm0, %xmm0		/* z=t^4|z=t^4 */
+
+	movaps	MO1(DP_SC4), %xmm2	/* S4 */
+	mulpd	%xmm0, %xmm2		/* z*S4 */
+	movaps	MO1(DP_SC3), %xmm3	/* S3 */
+	mulpd	%xmm0, %xmm3		/* z*S3 */
+	xorl	%eax, %ecx		/* (sign_x ^ (k>>2))<<2 */
+	addpd	MO1(DP_SC2), %xmm2	/* S2+z*S4 */
+	mulpd	%xmm0, %xmm2		/* z*(S2+z*S4) */
+	shrl	$2, %edx		/* (k+2)>>2 */
+	addpd	MO1(DP_SC1), %xmm3	/* S1+z*S3 */
+	mulpd	%xmm0, %xmm3		/* z*(S1+z*S3) */
+	shrl	$2, %ecx		/* sign_x ^ k>>2 */
+	addpd	MO1(DP_SC0), %xmm2	/* S0+z*(S2+z*S4) */
+	andl	$1, %edx		/* sign_cos = ((k+2)>>2)&1 */
+	mulpd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
+	andl	$1, %ecx		/* sign_sin = sign_x ^ ((k>>2)&1) */
+	addpd	%xmm2, %xmm3		/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	mulpd	%xmm4, %xmm3		/*t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+	testl	$2, %eax		/* n&2 != 0 ? */
+	addpd	%xmm4, %xmm3		/*t+t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+	jnz	L(sin_result_sin_poly)
+
+/*L(sin_result_cos_poly):*/
+	/*
+	 * Here if
+	 * cos(x) = poly_sin * sign_cos
+	 * sin(x) = poly_cos * sign_sin
+	 */
+	movsd	MO2(DP_ONES,%ecx,8), %xmm4/* 0|sign_sin */
+	movhpd	MO2(DP_ONES,%edx,8), %xmm4/* sign_cos|sign_sin */
+	mulpd	%xmm4, %xmm3		/* result_cos|result_sin */
+	movl	ARG_SIN_PTR, %eax
+	cvtpd2ps %xmm3, %xmm0		/* SP results */
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm0, (%eax)		/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (%ecx)		/* store cos(x) */
+	RETURN
+
+	.p2align	4
+L(sin_result_sin_poly):
+	/*
+	 * Here if
+	 * sin(x) = poly_sin * sign_sin
+	 * cos(x) = poly_cos * sign_cos
+	 */
+	movsd	MO2(DP_ONES,%edx,8), %xmm4/* 0|sign_cos */
+	movhpd	MO2(DP_ONES,%ecx,8), %xmm4/* sign_sin|sign_cos */
+	mulpd	%xmm4, %xmm3		/* result_sin|result_cos */
+	movl	ARG_SIN_PTR, %eax
+	cvtpd2ps %xmm3, %xmm0		/* SP results */
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm0, (%ecx)		/* store cos(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move sin(x) to xmm0[0] */
+	movss	%xmm0, (%eax)		/* store sin(x) */
+	RETURN
+
+	.p2align	4
+L(large_args):
+	/* Here if |x|>=9*Pi/4 */
+	cmpl	$0x7f800000, %eax	/* x is Inf or NaN ? */
+	jae	L(arg_inf_or_nan)
+
+	/* Here if finite |x|>=9*Pi/4 */
+	cmpl	$0x4b000000, %eax	/* |x|<2^23 ? */
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	MO1(DP_INVPIO4), %xmm1	/* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax		/* k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4		/* DP j */
+	movl	ARG_X, %ecx		/* Load x */
+	movsd	MO1(DP_PIO4HI), %xmm2	/* -PIO4HI = high part of -Pi/4 */
+	shrl	$29, %ecx		/* (sign of x) << 2 */
+	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
+	movsd	MO1(DP_PIO4LO), %xmm3	/* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
+	mulsd	%xmm3, %xmm4		/* j*PIO4LO */
+	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align	4
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax		/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+	subl	$68, %eax
+	movl	$28, %ecx		/* %cl=28 */
+	movl	%eax, %edx		/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl			/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3		/* |x| */
+	andl	$0xff, %eax		/* clear unneeded remainder from %ah */
+
+	imull	$28, %eax, %ecx		/* j*28 */
+	movsd	MO1(DP_HI_MASK), %xmm4	/* DP_HI_MASK */
+	movapd	%xmm0, %xmm5		/* |x| */
+	mulsd	-2*8+MO2(_FPI,%eax,8), %xmm3/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1		/* |x| */
+	mulsd	-1*8+MO2(_FPI,%eax,8), %xmm5/* tmp2 = FPI[j-1]*|x| */
+	mulsd	0*8+MO2(_FPI,%eax,8), %xmm0/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx		/* j*28+19 */
+	mulsd	1*8+MO2(_FPI,%eax,8), %xmm1/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx		/* bitpos>=j*28+19 ? */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4		/* HI(tmp3) */
+	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	MO1(DP_2POW52), %xmm6
+	movapd	%xmm5, %xmm2		/* tmp2 copy */
+	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
+	movsd	8+MO1(DP_2POW52), %xmm4
+	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
+	movl	ARG_X, %ecx		/* Load x */
+	comisd	%xmm5, %xmm4		/* tmp4 > tmp5 ? */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax		/* k-- */
+	addsd	8+MO1(DP_ONES), %xmm4	/* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx		/* k&1 */
+	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
+	addsd	MO2(DP_ZERONE,%edx,8), %xmm3/* t  = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3		/* t += tmp2 */
+	shrl	$29, %ecx		/* (sign of x) << 2 */
+	addsd	%xmm3, %xmm0		/* t += tmp0 */
+	addl	$1, %eax		/* n=k+1 */
+	addsd	%xmm1, %xmm0		/* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= PI04 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align	4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5 ? */
+	jl	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4 */
+	movaps	%xmm0, %xmm3		/* DP x */
+	movhpd	MO1(DP_ONES), %xmm3	/* DP 1|x */
+	mulsd	%xmm0, %xmm0		/* DP y=x^2 */
+	unpcklpd %xmm0, %xmm0		/* DP y|y */
+	movaps	%xmm0, %xmm1		/* y|y */
+	mulpd	%xmm0, %xmm0		/* z=x^4|z=x^4 */
+
+	movapd	MO1(DP_SC4), %xmm4	/* S4 */
+	mulpd	%xmm0, %xmm4		/* z*S4 */
+	movapd	MO1(DP_SC3), %xmm5	/* S3 */
+	mulpd	%xmm0, %xmm5		/* z*S3 */
+	addpd	MO1(DP_SC2), %xmm4	/* S2+z*S4 */
+	mulpd	%xmm0, %xmm4		/* z*(S2+z*S4) */
+	addpd	MO1(DP_SC1), %xmm5	/* S1+z*S3 */
+	mulpd	%xmm0, %xmm5		/* z*(S1+z*S3) */
+	addpd	MO1(DP_SC0), %xmm4	/* S0+z*(S2+z*S4) */
+	mulpd	%xmm1, %xmm4		/* y*(S0+z*(S2+z*S4)) */
+	mulpd	%xmm3, %xmm5		/* x*z*(S1+z*S3) */
+	mulpd	%xmm3, %xmm4		/* x*y*(S0+z*(S2+z*S4)) */
+	addpd	%xmm5, %xmm4		/*x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+	movl	ARG_SIN_PTR, %eax
+	addpd	%xmm4, %xmm3		/*x+x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+	movl	ARG_COS_PTR, %ecx
+	cvtpd2ps %xmm3, %xmm0		/* SP results */
+	movss	%xmm0, (%eax)		/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (%ecx)		/* store cos(x) */
+	RETURN
+
+	.p2align	4
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5 */
+	cmpl	$0x32000000, %eax	/* |x|<2^-27 ? */
+	jl	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5 */
+	movaps	%xmm0, %xmm1		/* DP x */
+	movhpd	MO1(DP_ONES), %xmm1	/* DP 1|x */
+	mulsd	%xmm0, %xmm0		/* DP x^2 */
+	unpcklpd %xmm0, %xmm0		/* DP x^2|x^2 */
+
+	movaps	MO1(DP_SINCOS2_1), %xmm3/* DP DP_SIN2_1 */
+	mulpd	%xmm0, %xmm3		/* DP x^2*DP_SIN2_1 */
+	addpd	MO1(DP_SINCOS2_0), %xmm3/* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+	mulpd	%xmm0, %xmm3		/* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+	mulpd	%xmm1, %xmm3		/* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	addpd	%xmm1, %xmm3		/* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	movl	ARG_SIN_PTR, %eax
+	cvtpd2ps %xmm3, %xmm0		/* SP results */
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm0, (%eax)		/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (%ecx)		/* store cos(x) */
+	RETURN
+
+	.p2align	4
+L(arg_less_2pn27):
+	movss	ARG_X, %xmm7		/* SP x */
+	cmpl	$0, %eax		/* x=0 ? */
+	je	L(arg_zero)		/* in case x=0 return sin(+-0)==+-0 */
+	/* Here if |x|<2^-27 */
+	/*
+	 * Special cases here:
+	 *  sin(subnormal) raises inexact/underflow
+	 *  sin(min_normalized) raises inexact/underflow
+	 *  sin(normalized) raises inexact
+	 *  cos(here)=1-|x| (raising inexact)
+	 */
+	movaps	%xmm0, %xmm3		/* DP x */
+	mulsd	MO1(DP_SMALL), %xmm0	/* DP x*DP_SMALL */
+	subsd	%xmm0, %xmm3		/* DP sin result is x-x*DP_SMALL */
+	andps	MO1(SP_ABS_MASK), %xmm7	/* SP |x| */
+	cvtsd2ss %xmm3, %xmm0		/* sin(x) */
+	movl	ARG_SIN_PTR, %eax
+	movss	MO1(SP_ONE), %xmm1	/* SP 1.0 */
+	movss	%xmm0, (%eax)		/* sin(x) store */
+	movl	ARG_COS_PTR, %ecx
+	subss	%xmm7, %xmm1		/* cos(x) */
+	movss	%xmm1, (%ecx)		/* cos(x) store */
+	RETURN
+
+	.p2align	4
+L(arg_zero):
+	movss	MO1(SP_ONE), %xmm0	/* 1.0 */
+	movl	ARG_SIN_PTR, %eax
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm7, (%eax)		/* sin(+-0)==x */
+	movss	%xmm0, (%ecx)		/* cos(+-0)==1 */
+	RETURN
+
+	.p2align	4
+L(arg_inf_or_nan):
+	movss	ARG_X, %xmm7		/* SP x */
+	/* Here if |x| is Inf or NAN */
+	jne	L(skip_errno_setting)	/* in case x is NaN */
+
+	/* Here if x is Inf. Set errno to EDOM.  */
+	call	JUMPTARGET(__errno_location)
+	movl	$EDOM, (%eax)
+
+	.p2align	4
+L(skip_errno_setting):
+	/* Here if |x| is Inf or NAN. Continued. */
+	subss	%xmm7, %xmm7		/* x-x, result is NaN */
+	movl	ARG_SIN_PTR, %eax
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm7, (%eax)
+	movss	%xmm7, (%ecx)
+	RETURN
+END(__sincosf_sse2)
+
+	.section .rodata, "a"
+	.p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+	.long	0x00000000,0x00000000
+	.long	0x54442d18,0x3fe921fb
+	.long	0x54442d18,0x3ff921fb
+	.long	0x7f3321d2,0x4002d97c
+	.long	0x54442d18,0x400921fb
+	.long	0x2955385e,0x400f6a7a
+	.long	0x7f3321d2,0x4012d97c
+	.long	0xe9bba775,0x4015fdbb
+	.long	0x54442d18,0x401921fb
+	.long	0xbeccb2bb,0x401c463a
+	.long	0x2955385e,0x401f6a7a
+	.type L(PIO4J), @object
+	ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+	.p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+	.long	0x00000000,0x00000000
+	.long	0x6c000000,0x3ff45f30
+	.long	0x2a000000,0x3e3c9c88
+	.long	0xa8000000,0x3c54fe13
+	.long	0xd0000000,0x3aaf47d4
+	.long	0x6c000000,0x38fbb81b
+	.long	0xe0000000,0x3714acc9
+	.long	0x7c000000,0x3560e410
+	.long	0x56000000,0x33bca2c7
+	.long	0xac000000,0x31fbd778
+	.long	0xe0000000,0x300b7246
+	.long	0xe8000000,0x2e5d2126
+	.long	0x48000000,0x2c970032
+	.long	0xe8000000,0x2ad77504
+	.long	0xe0000000,0x290921cf
+	.long	0xb0000000,0x274deb1c
+	.long	0xe0000000,0x25829a73
+	.long	0xbe000000,0x23fd1046
+	.long	0x10000000,0x2224baed
+	.long	0x8e000000,0x20709d33
+	.long	0x80000000,0x1e535a2f
+	.long	0x64000000,0x1cef904e
+	.long	0x30000000,0x1b0d6398
+	.long	0x24000000,0x1964ce7d
+	.long	0x16000000,0x17b908bf
+	.type L(_FPI), @object
+	ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomials for */
+/* sin(x)~=x+x*x^2*(DP_SIN2_0+x^2*DP_SIN2_1) in low  DP part, */
+/* cos(x)~=1+1*x^2*(DP_COS2_0+x^2*DP_COS2_1) in high DP part, */
+/* for |x|<2^-5. */
+	.p2align 4
+L(DP_SINCOS2_0):
+	.long	0x5543d49d,0xbfc55555
+	.long	0xff5cc6fd,0xbfdfffff
+	.type L(DP_SINCOS2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_0))
+
+	.p2align 4
+L(DP_SINCOS2_1):
+	.long	0x75cec8c5,0x3f8110f4
+	.long	0xb178dac5,0x3fa55514
+	.type L(DP_SINCOS2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_1))
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomials for */
+/* sin(t)~=t+t*t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))) in low  DP part, */
+/* cos(t)~=1+1*t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))) in high DP part, */
+/* for |t|<Pi/4. */
+	.p2align 4
+L(DP_SC4):
+	.long	0x1674b58a,0xbe5a947e
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_SC4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC4))
+
+	.p2align 4
+L(DP_SC3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_SC3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC3))
+
+	.p2align 4
+L(DP_SC2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_SC2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC2))
+
+	.p2align 4
+L(DP_SC1):
+	.long	0x10c2688b,0x3f811111
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_SC1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC1))
+
+	.p2align 4
+L(DP_SC0):
+	.long	0x55551cd9,0xbfc55555
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_SC0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC0))
+
+	.p2align 3
+L(DP_SMALL):
+	.long	0x00000000,0x3cd00000	/* 2^(-50) */
+	.type L(DP_SMALL), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of Pi/4 */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* Low part of Pi/4 */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983		/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+	.p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+	.long	0x7fffffff,0x7fffffff
+	.long	0x7fffffff,0x7fffffff
+	.type L(SP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+	.p2align 2
+L(SP_ONE):
+	.long	0x3f800000		/* 1.0 */
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias (__sincosf, sincosf)
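
The subtle part of the reconstruction above is the sign and swap bookkeeping
that yields both results from one reduced argument.  A hedged C model for
|x| < 9*Pi/4, with libm's sin and cos standing in for the packed S/C
polynomials (sincosf_model and pio4 are illustrative names):

#include <math.h>
#include <stdio.h>

static const double pio4 = 0x1.921fb54442d18p-1;	/* Pi/4 */

static void
sincosf_model (double x, double *sinp, double *cosp)
{
  double ax = fabs (x);
  int k = (int) (ax / pio4);
  int j = (k + 1) & 0x0e;
  int n = k + 1;
  double t = ax - j * pio4;		/* |t| <= Pi/4 */
  int sign_x = x < 0.0;
  double sign_sin = (sign_x ^ ((n >> 2) & 1)) ? -1.0 : 1.0;
  double sign_cos = (((n + 2) >> 2) & 1) ? -1.0 : 1.0;
  double poly_sin = sin (t);		/* t + t^3*(S0 + ...) */
  double poly_cos = cos (t);		/* 1 + t^2*(C0 + ...) */
  if (n & 2)
    {					/* polynomials swap roles */
      *sinp = sign_sin * poly_cos;
      *cosp = sign_cos * poly_sin;
    }
  else
    {
      *sinp = sign_sin * poly_sin;
      *cosp = sign_cos * poly_cos;
    }
}

int
main (void)
{
  double s, c;
  sincosf_model (-2.5, &s, &c);
  printf ("%f %f (vs %f %f)\n", s, c, sin (-2.5), cos (-2.5));
  return 0;
}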
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
new file mode 100644
index 0000000000..9428f9b4ea
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
@@ -0,0 +1,30 @@
+/* Multiple versions of sincosf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern void __sincosf_sse2 (float, float *, float *);
+extern void __sincosf_ia32 (float, float *, float *);
+void __sincosf (float, float *, float *);
+
+libm_ifunc (__sincosf,
+	    HAS_CPU_FEATURE (SSE2) ? __sincosf_sse2 : __sincosf_ia32);
+weak_alias (__sincosf, sincosf);
+
+#define SINCOSF __sincosf_ia32
+#include <sysdeps/ieee754/flt-32/s_sincosf.c>
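
From user code the selected variant is reached through the plain sincosf
entry point, a GNU extension.  A minimal usage sketch (compile with -lm):

#define _GNU_SOURCE		/* sincosf is a GNU extension */
#include <math.h>
#include <stdio.h>

int
main (void)
{
  float s, c;
  sincosf (1.0f, &s, &c);	/* both results from one call */
  printf ("sin=%f cos=%f\n", s, c);
  return 0;
}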
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S
new file mode 100644
index 0000000000..ee96018061
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S
@@ -0,0 +1,566 @@
+/* SSE2-optimized version of sinf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x| == 0: return x.
+ *  2) if |x| <  2^-27: return x-x*DP_SMALL, raise underflow only when needed.
+ *  3) if |x| <  2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
+ *  4) if |x| <   Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          s = sign(x) * (-1.0)^((n>>2)&1)
+ *          if(n&2 != 0) {
+ *              using cos(t) polynomial for |t|<Pi/4, result is
+ *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ *          } else {
+ *              using sin(t) polynomial for |t|<Pi/4, result is
+ *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      7.2) Reconstruction same as (5.2).
+ *  8) if x is Inf, return x-x, and set errno=EDOM.
+ *  9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ *  sin(+-0) = +-0 not raising inexact/underflow,
+ *  sin(subnormal) raises inexact/underflow,
+ *  sin(min_normalized) raises inexact/underflow,
+ *  sin(normalized) raises inexact,
+ *  sin(Inf) = NaN, raises invalid, sets errno to EDOM,
+ *  sin(NaN) = NaN.
+ */
+
+#ifdef	PIC
+# define MO1(symbol)			L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG)	cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG)	cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG)			pushl REG; CFI_PUSH(REG)
+# define POP(REG)			popl REG; CFI_POP(REG)
+# define ENTRANCE			PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN				POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X				8(%esp)
+#else
+# define MO1(symbol)			L(symbol)
+# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN				ret
+# define ARG_X				4(%esp)
+#endif
+
+	.text
+ENTRY(__sinf_sse2)
+	/* Input: single precision x on stack at address ARG_X */
+
+	ENTRANCE
+	movl	ARG_X, %eax		/* Bits of x */
+	cvtss2sd ARG_X, %xmm0		/* DP x */
+	andl	$0x7fffffff, %eax	/* |x| */
+
+	cmpl	$0x3f490fdb, %eax	/* |x|<Pi/4?  */
+	jb	L(arg_less_pio4)
+
+	/* Here if |x|>=Pi/4 */
+	movd	%eax, %xmm3		/* SP |x| */
+	andpd	MO1(DP_ABS_MASK),%xmm0	/* DP |x| */
+	movss	MO1(SP_INVPIO4), %xmm2	/* SP 1/(Pi/4) */
+
+	cmpl	$0x40e231d6, %eax	/* |x|<9*Pi/4?  */
+	jae	L(large_args)
+
+	/* Here if Pi/4<=|x|<9*Pi/4 */
+	mulss	%xmm3, %xmm2		/* SP |x|/(Pi/4) */
+	movl	ARG_X, %ecx		/* Load x */
+	cvttss2si %xmm2, %eax		/* k, number of Pi/4 in x */
+	shrl	$31, %ecx		/* sign of x */
+	addl	$1, %eax		/* k+1 */
+	movl	$0x0e, %edx
+	andl	%eax, %edx		/* j = (k+1)&0x0e */
+	subsd	MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+	/* Input: %eax=n, %xmm0=t, %ecx=sign(x) */
+	testl	$2, %eax		/* n&2 != 0?  */
+	jz	L(sin_poly)
+
+/*L(cos_poly):*/
+	/* Here if sin(x) calculated using cos(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = sign(x) * (-1.0)^((n>>2)&1)
+	 * result = s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
+	 */
+	shrl	$2, %eax		/* n>>2 */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	andl	$1, %eax		/* (n>>2)&1 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=t^4 */
+
+	movsd	MO1(DP_C4), %xmm4	/* C4 */
+	mulsd	%xmm0, %xmm4		/* z*C4 */
+	xorl	%eax, %ecx		/* (-1.0)^((n>>2)&1) XOR sign(x) */
+	movsd	MO1(DP_C3), %xmm3	/* C3 */
+	mulsd	%xmm0, %xmm3		/* z*C3 */
+	addsd	MO1(DP_C2), %xmm4	/* C2+z*C4 */
+	mulsd	%xmm0, %xmm4		/* z*(C2+z*C4) */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+	addsd	MO1(DP_C1), %xmm3	/* C1+z*C3 */
+	mulsd	%xmm0, %xmm3		/* z*(C1+z*C3) */
+	addsd	MO1(DP_C0), %xmm4	/* C0+z*(C2+z*C4) */
+	mulsd	%xmm1, %xmm4		/* y*(C0+z*(C2+z*C4)) */
+
+	addsd	%xmm4, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	/* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	addsd	MO1(DP_ONES), %xmm3
+
+	mulsd	MO2(DP_ONES,%ecx,8), %xmm3 /* DP result */
+	movsd	%xmm3, 0(%esp)		/* Move result from sse...  */
+	fldl	0(%esp)			/* ...to FPU.  */
+	/* Return back 8 bytes of stack frame */
+	lea	8(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(sin_poly):
+	/* Here if sin(x) calculated using sin(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = sign(x) * (-1.0)^((n>>2)&1)
+	 * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
+	 */
+
+	movaps	%xmm0, %xmm4		/* t */
+	shrl	$2, %eax		/* n>>2 */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	andl	$1, %eax		/* (n>>2)&1 */
+	movaps	%xmm0, %xmm1		/* y */
+	xorl	%eax, %ecx		/* (-1.0)^((n>>2)&1) XOR sign(x) */
+	mulsd	%xmm0, %xmm0		/* z=t^4 */
+
+	movsd	MO1(DP_S4), %xmm2	/* S4 */
+	mulsd	%xmm0, %xmm2		/* z*S4 */
+	movsd	MO1(DP_S3), %xmm3	/* S3 */
+	mulsd	%xmm0, %xmm3		/* z*S3 */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+	addsd	MO1(DP_S2), %xmm2	/* S2+z*S4 */
+	mulsd	%xmm0, %xmm2		/* z*(S2+z*S4) */
+	addsd	MO1(DP_S1), %xmm3	/* S1+z*S3 */
+	mulsd	%xmm0, %xmm3		/* z*(S1+z*S3) */
+	addsd	MO1(DP_S0), %xmm2	/* S0+z*(S2+z*S4) */
+	mulsd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
+	/* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */
+	mulsd	MO2(DP_ONES,%ecx,8), %xmm4
+	addsd	%xmm2, %xmm3		/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	/* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	mulsd	%xmm4, %xmm3
+	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
+	addsd	%xmm4, %xmm3
+	movsd	%xmm3, 0(%esp)		/* Move result from sse...  */
+	fldl	0(%esp)			/* ...to FPU.  */
+	/* Return back 8 bytes of stack frame */
+	lea	8(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(large_args):
+	/* Here if |x|>=9*Pi/4 */
+	cmpl	$0x7f800000, %eax	/* x is Inf or NaN?  */
+	jae	L(arg_inf_or_nan)
+
+	/* Here if finite |x|>=9*Pi/4 */
+	cmpl	$0x4b000000, %eax	/* |x|<2^23?  */
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	MO1(DP_INVPIO4), %xmm1	/* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax		/* k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4		/* DP j */
+	movl	ARG_X, %ecx		/* Load x */
+	movsd	MO1(DP_PIO4HI), %xmm2	/* -PIO4HI = high part of -Pi/4 */
+	shrl	$31, %ecx		/* sign bit of x */
+	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
+	movsd	MO1(DP_PIO4LO), %xmm3	/* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
+	mulsd	%xmm3, %xmm4		/* j*PIO4LO */
+	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align	4
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax		/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+	subl	$68, %eax
+	movl	$28, %ecx		/* %cl=28 */
+	movl	%eax, %edx		/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl			/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3		/* |x| */
+	/* clear unneeded remainder from %ah */
+	andl	$0xff, %eax
+
+	imull	$28, %eax, %ecx		/* j*28 */
+	movsd	MO1(DP_HI_MASK), %xmm4	/* DP_HI_MASK */
+	movapd	%xmm0, %xmm5		/* |x| */
+	mulsd	-2*8+MO2(_FPI,%eax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1		/* |x| */
+	mulsd	-1*8+MO2(_FPI,%eax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
+	mulsd	0*8+MO2(_FPI,%eax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx		/* j*28+19 */
+	mulsd	1*8+MO2(_FPI,%eax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx		/* bitpos>=j*28+19?   */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4		/* HI(tmp3) */
+	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	MO1(DP_2POW52), %xmm6
+	movapd	%xmm5, %xmm2		/* tmp2 copy */
+	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
+	movsd	8+MO1(DP_2POW52), %xmm4
+	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
+	movl	ARG_X, %ecx		/* Load x */
+	comisd	%xmm5, %xmm4		/* tmp4 > tmp5?  */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax		/* k-- */
+	addsd	8+MO1(DP_ONES), %xmm4	/* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx		/* k&1 */
+	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
+	addsd	MO2(DP_ZERONE,%edx,8), %xmm3 /* t  = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3		/* t += tmp2 */
+	shrl	$31, %ecx		/* sign of x */
+	addsd	%xmm3, %xmm0		/* t += tmp0 */
+	addl	$1, %eax		/* n=k+1 */
+	addsd	%xmm1, %xmm0		/* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= PI04 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align	4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5?  */
+	jl	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4 */
+	movaps	%xmm0, %xmm3		/* x */
+	mulsd	%xmm0, %xmm0		/* y=x^2 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=x^4 */
+	movsd	MO1(DP_S4), %xmm4	/* S4 */
+	mulsd	%xmm0, %xmm4		/* z*S4 */
+	movsd	MO1(DP_S3), %xmm5	/* S3 */
+	mulsd	%xmm0, %xmm5		/* z*S3 */
+	addsd	MO1(DP_S2), %xmm4	/* S2+z*S4 */
+	mulsd	%xmm0, %xmm4		/* z*(S2+z*S4) */
+	addsd	MO1(DP_S1), %xmm5	/* S1+z*S3 */
+	mulsd	%xmm0, %xmm5		/* z*(S1+z*S3) */
+	addsd	MO1(DP_S0), %xmm4	/* S0+z*(S2+z*S4) */
+	mulsd	%xmm1, %xmm4		/* y*(S0+z*(S2+z*S4)) */
+	mulsd	%xmm3, %xmm5		/* x*z*(S1+z*S3) */
+	mulsd	%xmm3, %xmm4		/* x*y*(S0+z*(S2+z*S4)) */
+	/* x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	addsd	%xmm5, %xmm4
+	/* x + x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	addsd	%xmm4, %xmm3
+	cvtsd2ss %xmm3, %xmm3		/* SP result */
+
+L(epilogue):
+	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
+	movss	%xmm3, 0(%esp)		/* Move result from sse...  */
+	flds	0(%esp)			/* ...to FPU.  */
+	/* Return back 4 bytes of stack frame */
+	lea	4(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5 */
+	cmpl	$0x32000000, %eax	/* |x|<2^-27?  */
+	jl	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5 */
+	movaps	%xmm0, %xmm1		/* DP x */
+	mulsd	%xmm0, %xmm0		/* DP x^2 */
+	movsd	MO1(DP_SIN2_1), %xmm3	/* DP DP_SIN2_1 */
+	mulsd	%xmm0, %xmm3		/* DP x^2*DP_SIN2_1 */
+	addsd	MO1(DP_SIN2_0), %xmm3	/* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+	mulsd	%xmm0, %xmm3		/* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+	mulsd	%xmm1, %xmm3		/* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	addsd	%xmm1, %xmm3		/* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	cvtsd2ss %xmm3, %xmm3		/* SP result */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_less_2pn27):
+	movss	ARG_X, %xmm3		/* SP x */
+	cmpl	$0, %eax		/* x=0?  */
+	je	L(epilogue)		/* in case x=0 return sin(+-0)==+-0 */
+	/* Here if |x|<2^-27 */
+	/*
+	 * Special cases here:
+	 *  sin(subnormal) raises inexact/underflow
+	 *  sin(min_normalized) raises inexact/underflow
+	 *  sin(normalized) raises inexact
+	 */
+	movaps	%xmm0, %xmm3		/* Copy of DP x */
+	mulsd	MO1(DP_SMALL), %xmm0	/* x*DP_SMALL */
+	subsd	%xmm0, %xmm3		/* Result is x-x*DP_SMALL */
+	cvtsd2ss %xmm3, %xmm3		/* Result converted to SP */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_inf_or_nan):
+	/* Here if |x| is Inf or NAN */
+	jne	L(skip_errno_setting)	/* in case x is NaN */
+
+	/* Here if x is Inf. Set errno to EDOM.  */
+	call	JUMPTARGET(__errno_location)
+	movl	$EDOM, (%eax)
+
+	.p2align	4
+L(skip_errno_setting):
+	/* Here if |x| is Inf or NAN. Continued.  */
+	movss	ARG_X, %xmm3		/* load x */
+	subss	%xmm3, %xmm3		/* Result is NaN */
+	jmp	L(epilogue)
+END(__sinf_sse2)
+
+	.section .rodata, "a"
+	.p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+	.long	0x00000000,0x00000000
+	.long	0x54442d18,0x3fe921fb
+	.long	0x54442d18,0x3ff921fb
+	.long	0x7f3321d2,0x4002d97c
+	.long	0x54442d18,0x400921fb
+	.long	0x2955385e,0x400f6a7a
+	.long	0x7f3321d2,0x4012d97c
+	.long	0xe9bba775,0x4015fdbb
+	.long	0x54442d18,0x401921fb
+	.long	0xbeccb2bb,0x401c463a
+	.long	0x2955385e,0x401f6a7a
+	.type L(PIO4J), @object
+	ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+	.p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+	.long	0x00000000,0x00000000
+	.long	0x6c000000,0x3ff45f30
+	.long	0x2a000000,0x3e3c9c88
+	.long	0xa8000000,0x3c54fe13
+	.long	0xd0000000,0x3aaf47d4
+	.long	0x6c000000,0x38fbb81b
+	.long	0xe0000000,0x3714acc9
+	.long	0x7c000000,0x3560e410
+	.long	0x56000000,0x33bca2c7
+	.long	0xac000000,0x31fbd778
+	.long	0xe0000000,0x300b7246
+	.long	0xe8000000,0x2e5d2126
+	.long	0x48000000,0x2c970032
+	.long	0xe8000000,0x2ad77504
+	.long	0xe0000000,0x290921cf
+	.long	0xb0000000,0x274deb1c
+	.long	0xe0000000,0x25829a73
+	.long	0xbe000000,0x23fd1046
+	.long	0x10000000,0x2224baed
+	.long	0x8e000000,0x20709d33
+	.long	0x80000000,0x1e535a2f
+	.long	0x64000000,0x1cef904e
+	.long	0x30000000,0x1b0d6398
+	.long	0x24000000,0x1964ce7d
+	.long	0x16000000,0x17b908bf
+	.type L(_FPI), @object
+	ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+   for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5.  */
+	.p2align 3
+L(DP_SIN2_0):
+	.long	0x5543d49d,0xbfc55555
+	.type L(DP_SIN2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SIN2_0))
+
+	.p2align 3
+L(DP_SIN2_1):
+	.long	0x75cec8c5,0x3f8110f4
+	.type L(DP_SIN2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SIN2_1))
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+   for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_S3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.type L(DP_S3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+	.p2align 3
+L(DP_S1):
+	.long	0x10c2688b,0x3f811111
+	.type L(DP_S1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+	.p2align 3
+L(DP_S4):
+	.long	0x1674b58a,0xbe5a947e
+	.type L(DP_S4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+	.p2align 3
+L(DP_S2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.type L(DP_S2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+	.p2align 3
+L(DP_S0):
+	.long	0x55551cd9,0xbfc55555
+	.type L(DP_S0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+	.p2align 3
+L(DP_SMALL):
+	.long	0x00000000,0x3cd00000	/* 2^(-50) */
+	.type L(DP_SMALL), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+/* Coefficients of polynomial
+   for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_C3):
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_C3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+	.p2align 3
+L(DP_C1):
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_C1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+	.p2align 3
+L(DP_C4):
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_C4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+	.p2align 3
+L(DP_C2):
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_C2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+	.p2align 3
+L(DP_C0):
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_C0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11a62633,0xbe010b46	/* Low part of -Pi/4 */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983		/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+weak_alias (__sinf, sinf)
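The L(DP_S0)..L(DP_S4) and L(DP_C0)..L(DP_C4) tables above hold the polynomial coefficients named in the comments, evaluated in Horner form once the argument has been reduced to |t| < Pi/4.  A minimal stand-alone C sketch of the sine polynomial (an illustration only: textbook Taylor coefficients appear here, while the hex tables store minimax-tuned values):

    /* sin(t) ~= t + t^3*(S0 + t^2*(S1 + t^2*(S2 + t^2*(S3 + t^2*S4)))).  */
    static double
    sin_poly (double t)
    {
      const double S0 = -1.0 / 6.0;          /* ~ L(DP_S0) */
      const double S1 =  1.0 / 120.0;        /* ~ L(DP_S1) */
      const double S2 = -1.0 / 5040.0;       /* ~ L(DP_S2) */
      const double S3 =  1.0 / 362880.0;     /* ~ L(DP_S3) */
      const double S4 = -1.0 / 39916800.0;   /* ~ L(DP_S4) */
      double t2 = t * t;
      return t + t * t2 * (S0 + t2 * (S1 + t2 * (S2 + t2 * (S3 + t2 * S4))));
    }

The cosine polynomial from L(DP_C0)..L(DP_C4) is evaluated the same way, with 1.0 in place of the leading t.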
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
new file mode 100644
index 0000000000..8ccdd2f34d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
@@ -0,0 +1,28 @@
+/* Multiple versions of sinf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern float __sinf_sse2 (float);
+extern float __sinf_ia32 (float);
+float __sinf (float);
+
+libm_ifunc (__sinf, HAS_CPU_FEATURE (SSE2) ? __sinf_sse2 : __sinf_ia32);
+weak_alias (__sinf, sinf);
+#define SINF __sinf_ia32
+#include <sysdeps/ieee754/flt-32/s_sinf.c>
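libm_ifunc above makes the loader bind sinf to the SSE2 or plain-ia32 entry point once, when the symbol is first resolved.  A stand-alone sketch of the same dispatch with GCC's ifunc attribute (my_sinf, cpu_has_sse2 and resolve_sinf are hypothetical names standing in for the real HAS_CPU_FEATURE machinery):

    extern float sinf_sse2 (float);
    extern float sinf_ia32 (float);
    extern int cpu_has_sse2 (void);     /* placeholder feature test */

    /* The resolver runs once, at symbol binding time.  */
    static float (*resolve_sinf (void)) (float)
    {
      return cpu_has_sse2 () ? sinf_sse2 : sinf_ia32;
    }

    float my_sinf (float) __attribute__ ((ifunc ("resolve_sinf")));

After binding, calls to my_sinf go straight to the chosen implementation with no per-call test.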
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S
new file mode 100644
index 0000000000..ace8db9410
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S
@@ -0,0 +1,39 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmax)
+	fldl	4(%esp)		// x
+	fldl	12(%esp)	// x : y
+
+	fucomi	%st(0), %st
+	fcmovu	%st(1), %st	// now %st contains y if not NaN, x otherwise
+
+	fxch
+
+	fucomi	%st(1), %st
+	fcmovb	%st(1), %st
+
+	fstp	%st(1)
+
+	ret
+END(__fmax)
+weak_alias (__fmax, fmax)
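The fucomi/fcmovu pair above encodes the C99 rule that a quiet NaN argument to fmax counts as a missing argument.  A C reference model of what the two conditional moves compute:

    #include <math.h>

    /* fmax with NaN treated as a missing argument.  */
    static double
    fmax_ref (double x, double y)
    {
      if (isnan (x))
        return y;
      if (isnan (y))
        return x;
      return x >= y ? x : y;
    }

The single-precision s_fmaxf.S below and the fmin variants further down follow the same scheme, with fcmovnb selecting the smaller value for fmin.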
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S
new file mode 100644
index 0000000000..3a25951a09
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S
@@ -0,0 +1,39 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmaxf)
+	flds	4(%esp)		// x
+	flds	8(%esp)		// x : y
+
+	fucomi	%st(0), %st
+	fcmovu	%st(1), %st	// now %st contains y if not NaN, x otherwise
+
+	fxch
+
+	fucomi	%st(1), %st
+	fcmovb	%st(1), %st
+
+	fstp	%st(1)
+
+	ret
+END(__fmaxf)
+weak_alias (__fmaxf, fmaxf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S
new file mode 100644
index 0000000000..3f6c21c63d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S
@@ -0,0 +1,58 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmaxl)
+	fldt	4(%esp)		// x
+	fldt	16(%esp)	// x : y
+
+	fucomi	%st(1), %st
+	jp	2f
+	fcmovb	%st(1), %st
+
+	fstp	%st(1)
+
+	ret
+
+2:	// Unordered.
+	fucomi	%st(0), %st
+	jp	3f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 11(%esp)
+	jz	4f
+	fstp	%st(1)
+	ret
+
+3:	// st(0) is a NaN; st(1) may or may not be.
+	fxch
+	fucomi	%st(0), %st
+	jp	4f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 23(%esp)
+	jz	4f
+	fstp	%st(1)
+	ret
+
+4:	// Both arguments are NaNs, or one is a signaling NaN.
+	faddp
+	ret
+END(__fmaxl)
+weak_alias (__fmaxl, fmaxl)
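The long double version also distinguishes quiet from signaling NaNs: the testb $0x40 instructions probe bit 62 of the 80-bit significand (the quiet bit; x sits at 4(%esp), so its top significand byte is at 11(%esp)), and the faddp at label 4 raises the invalid exception when a signaling NaN is involved.  In C terms (issignaling is a GNU extension available from <math.h> under _GNU_SOURCE):

    #define _GNU_SOURCE
    #include <math.h>

    static long double
    fmaxl_ref (long double x, long double y)
    {
      if (issignaling (x) || issignaling (y))
        return x + y;            /* raise invalid, yield a quiet NaN */
      if (isnan (x))
        return y;                /* quiet NaN: missing argument */
      if (isnan (y))
        return x;
      return x >= y ? x : y;
    }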
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S
new file mode 100644
index 0000000000..72d306fd79
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S
@@ -0,0 +1,37 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmin)
+	fldl	4(%esp)		// x
+	fldl	12(%esp)	// x : y
+
+	fucomi	%st(0), %st
+	fcmovu	%st(1), %st	// now %st contains y if not NaN, x otherwise
+
+	fucomi	%st(1), %st
+	fcmovnb	%st(1), %st
+
+	fstp	%st(1)
+
+	ret
+END(__fmin)
+weak_alias (__fmin, fmin)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S
new file mode 100644
index 0000000000..52ea892bad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S
@@ -0,0 +1,37 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fminf)
+	flds	4(%esp)		// x
+	flds	8(%esp)		// x : y
+
+	fucomi	%st(0), %st
+	fcmovu	%st(1), %st	// now %st contains y if not NaN, x otherwise
+
+	fucomi	%st(1), %st
+	fcmovnb	%st(1), %st
+
+	fstp	%st(1)
+
+	ret
+END(__fminf)
+weak_alias (__fminf, fminf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S
new file mode 100644
index 0000000000..e1cb83fed7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S
@@ -0,0 +1,58 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fminl)
+	fldt	4(%esp)		// x
+	fldt	16(%esp)	// x : y
+
+	fucomi	%st(1), %st
+	jp	2f
+	fcmovnb	%st(1), %st
+
+	fstp	%st(1)
+
+	ret
+
+2:	// Unordered.
+	fucomi	%st(0), %st
+	jp	3f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 11(%esp)
+	jz	4f
+	fstp	%st(1)
+	ret
+
+3:	// st(0) is a NaN; st(1) may or may not be.
+	fxch
+	fucomi	%st(0), %st
+	jp	4f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 23(%esp)
+	jz	4f
+	fstp	%st(1)
+	ret
+
+4:	// Both arguments are NaNs, or one is a signaling NaN.
+	faddp
+	ret
+END(__fminl)
+weak_alias (__fminl, fminl)
diff --git a/REORG.TODO/sysdeps/i386/i686/hp-timing.h b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
new file mode 100644
index 0000000000..1b11410feb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
@@ -0,0 +1,42 @@
+/* High precision, low overhead timing functions.  i686 version.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _HP_TIMING_H
+#define _HP_TIMING_H	1
+
+/* We always assume the timestamp register is available.  */
+#define HP_TIMING_AVAIL		(1)
+#define HP_SMALL_TIMING_AVAIL	(1)
+
+/* We provide inlined functions.  */
+#define HP_TIMING_INLINE	(1)
+
+/* We use 64-bit values for the times.  */
+typedef unsigned long long int hp_timing_t;
+
+/* That's quite simple.  Use the `rdtsc' instruction.  Note that the value
+   might not be 100% accurate since other instructions might be executing
+   at the same moment.  This could be avoided by using a serializing
+   barrier like `cpuid' right before the `rdtsc' instruction.  But we are
+   not interested in perfectly accurate clock cycles here, so we don't do
+   this.  */
+#define HP_TIMING_NOW(Var)	__asm__ __volatile__ ("rdtsc" : "=A" (Var))
+
+#include <hp-timing-common.h>
+
+#endif	/* hp-timing.h */
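The comment above notes that a serializing instruction such as cpuid before rdtsc would tighten the measurement.  A minimal sketch of that variant, for illustration only (not part of this header):

    #include <stdint.h>

    /* Read the TSC after a cpuid barrier, so earlier instructions
       retire before the timestamp is taken.  cpuid clobbers
       eax/ebx/ecx/edx, hence the clobber list.  */
    static inline uint64_t
    rdtsc_serialized (void)
    {
      uint32_t lo, hi;
      __asm__ __volatile__ ("cpuid\n\trdtsc"
                            : "=a" (lo), "=d" (hi)
                            : "a" (0)
                            : "ebx", "ecx", "memory");
      return ((uint64_t) hi << 32) | lo;
    }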
diff --git a/REORG.TODO/sysdeps/i386/i686/init-arch.h b/REORG.TODO/sysdeps/i386/i686/init-arch.h
new file mode 100644
index 0000000000..f55f80efa0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MINIMUM_ISA 686
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/memcmp.S b/REORG.TODO/sysdeps/i386/i686/memcmp.S
new file mode 100644
index 0000000000..5140ee2145
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcmp.S
@@ -0,0 +1,408 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS		4+4	/* Preserve EBX.  */
+#define BLK1		PARMS
+#define BLK2		BLK1+4
+#define LEN		BLK2+4
+#define ENTRANCE	pushl %ebx; cfi_adjust_cfa_offset (4); \
+			cfi_rel_offset (ebx, 0)
+#define RETURN		popl %ebx; cfi_adjust_cfa_offset (-4); \
+			cfi_restore (ebx); ret
+
+/* Load an entry in a jump table into EBX.  TABLE is a jump table
+   with relative offsets.  INDEX is a register containing the index
+   into the jump table.  */
+#define LOAD_JUMP_TABLE_ENTRY(TABLE, INDEX) \
+  /* We first load PC into EBX.  */					      \
+  SETUP_PIC_REG(bx);							      \
+  /* Get the address of the jump table.  */				      \
+  addl	$(TABLE - .), %ebx;						      \
+  /* Get the entry and convert the relative offset to the		      \
+     absolute address.  */						      \
+  addl	(%ebx,INDEX,4), %ebx
+
+        .text
+	ALIGN (4)
+ENTRY (memcmp)
+	ENTRANCE
+
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+
+	cmpl 	$1, %ecx
+	jne	L(not_1)
+	movzbl	(%eax), %ecx		/* LEN == 1  */
+	cmpb	(%edx), %cl
+	jne	L(neq)
+L(bye):
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+L(neq):
+	sbbl	%eax, %eax
+	sbbl	$-1, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+L(not_1):
+	jl	L(bye)			/* LEN == 0  */
+
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	movl	%eax, %esi
+	cfi_rel_offset (esi, 0)
+	cmpl	$32, %ecx
+	jge	L(32bytesormore)	/* LEN >= 32  */
+
+	LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+	addl	%ecx, %edx
+	addl	%ecx, %esi
+	jmp	*%ebx
+
+	ALIGN (4)
+L(28bytes):
+	movl	-28(%esi), %eax
+	movl	-28(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(24bytes):
+	movl	-24(%esi), %eax
+	movl	-24(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(20bytes):
+	movl	-20(%esi), %eax
+	movl	-20(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(16bytes):
+	movl	-16(%esi), %eax
+	movl	-16(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(12bytes):
+	movl	-12(%esi), %eax
+	movl	-12(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(8bytes):
+	movl	-8(%esi), %eax
+	movl	-8(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(4bytes):
+	movl	-4(%esi), %eax
+	movl	-4(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(0bytes):
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (esi, 0)
+	cfi_rel_offset (ebx, 4)
+L(29bytes):
+	movl	-29(%esi), %eax
+	movl	-29(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(25bytes):
+	movl	-25(%esi), %eax
+	movl	-25(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(21bytes):
+	movl	-21(%esi), %eax
+	movl	-21(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(17bytes):
+	movl	-17(%esi), %eax
+	movl	-17(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(13bytes):
+	movl	-13(%esi), %eax
+	movl	-13(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(9bytes):
+	movl	-9(%esi), %eax
+	movl	-9(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(5bytes):
+	movl	-5(%esi), %eax
+	movl	-5(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(1bytes):
+	movzbl	-1(%esi), %eax
+	cmpb	-1(%edx), %al
+	jne	L(set)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (esi, 0)
+	cfi_rel_offset (ebx, 4)
+L(30bytes):
+	movl	-30(%esi), %eax
+	movl	-30(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(26bytes):
+	movl	-26(%esi), %eax
+	movl	-26(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(22bytes):
+	movl	-22(%esi), %eax
+	movl	-22(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(18bytes):
+	movl	-18(%esi), %eax
+	movl	-18(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(14bytes):
+	movl	-14(%esi), %eax
+	movl	-14(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(10bytes):
+	movl	-10(%esi), %eax
+	movl	-10(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(6bytes):
+	movl	-6(%esi), %eax
+	movl	-6(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%esi), %eax
+	movzwl	-2(%edx), %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpl	%ecx, %eax
+	jne	L(set)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (esi, 0)
+	cfi_rel_offset (ebx, 4)
+L(31bytes):
+	movl	-31(%esi), %eax
+	movl	-31(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%esi), %eax
+	movl	-27(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%esi), %eax
+	movl	-23(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%esi), %eax
+	movl	-19(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%esi), %eax
+	movl	-15(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%esi), %eax
+	movl	-11(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%esi), %eax
+	movl	-7(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%esi), %eax
+	movzwl	-3(%edx), %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpl	%ecx, %eax
+	jne	L(set)
+	movzbl	-1(%esi), %eax
+	cmpb	-1(%edx), %al
+	jne	L(set)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (esi, 0)
+	cfi_rel_offset (ebx, 4)
+	ALIGN (4)
+/* ECX >= 32.  */
+L(32bytesormore):
+	subl	$32, %ecx
+
+	movl	(%esi), %eax
+	cmpl	(%edx), %eax
+	jne	L(load_ecx)
+
+	movl	4(%esi), %eax
+	cmpl	4(%edx), %eax
+	jne	L(load_ecx_4)
+
+	movl	8(%esi), %eax
+	cmpl	8(%edx), %eax
+	jne	L(load_ecx_8)
+
+	movl	12(%esi), %eax
+	cmpl	12(%edx), %eax
+	jne	L(load_ecx_12)
+
+	movl	16(%esi), %eax
+	cmpl	16(%edx), %eax
+	jne	L(load_ecx_16)
+
+	movl	20(%esi), %eax
+	cmpl	20(%edx), %eax
+	jne	L(load_ecx_20)
+
+	movl	24(%esi), %eax
+	cmpl	24(%edx), %eax
+	jne	L(load_ecx_24)
+
+	movl	28(%esi), %eax
+	cmpl	28(%edx), %eax
+	jne	L(load_ecx_28)
+
+	addl	$32, %esi
+	addl	$32, %edx
+	cmpl	$32, %ecx
+	jge	L(32bytesormore)
+
+	LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+	addl	%ecx, %edx
+	addl	%ecx, %esi
+	jmp	*%ebx
+
+L(load_ecx_28):
+	addl	$0x4, %edx
+L(load_ecx_24):
+	addl	$0x4, %edx
+L(load_ecx_20):
+	addl	$0x4, %edx
+L(load_ecx_16):
+	addl	$0x4, %edx
+L(load_ecx_12):
+	addl	$0x4, %edx
+L(load_ecx_8):
+	addl	$0x4, %edx
+L(load_ecx_4):
+	addl	$0x4, %edx
+L(load_ecx):
+	movl	(%edx), %ecx
+
+L(find_diff):
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpb	%ch, %ah
+	jne	L(set)
+	shrl	$16,%eax
+	shrl	$16,%ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	/* We get here only if we already know there is a
+	   difference.  */
+	cmpl	%ecx, %eax
+L(set):
+	sbbl	%eax, %eax
+	sbbl	$-1, %eax
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	RETURN
+END (memcmp)
+
+	.section	.rodata
+	ALIGN (2)
+L(table_32bytes):
+	.long	L(0bytes) - L(table_32bytes)
+	.long	L(1bytes) - L(table_32bytes)
+	.long	L(2bytes) - L(table_32bytes)
+	.long	L(3bytes) - L(table_32bytes)
+	.long	L(4bytes) - L(table_32bytes)
+	.long	L(5bytes) - L(table_32bytes)
+	.long	L(6bytes) - L(table_32bytes)
+	.long	L(7bytes) - L(table_32bytes)
+	.long	L(8bytes) - L(table_32bytes)
+	.long	L(9bytes) - L(table_32bytes)
+	.long	L(10bytes) - L(table_32bytes)
+	.long	L(11bytes) - L(table_32bytes)
+	.long	L(12bytes) - L(table_32bytes)
+	.long	L(13bytes) - L(table_32bytes)
+	.long	L(14bytes) - L(table_32bytes)
+	.long	L(15bytes) - L(table_32bytes)
+	.long	L(16bytes) - L(table_32bytes)
+	.long	L(17bytes) - L(table_32bytes)
+	.long	L(18bytes) - L(table_32bytes)
+	.long	L(19bytes) - L(table_32bytes)
+	.long	L(20bytes) - L(table_32bytes)
+	.long	L(21bytes) - L(table_32bytes)
+	.long	L(22bytes) - L(table_32bytes)
+	.long	L(23bytes) - L(table_32bytes)
+	.long	L(24bytes) - L(table_32bytes)
+	.long	L(25bytes) - L(table_32bytes)
+	.long	L(26bytes) - L(table_32bytes)
+	.long	L(27bytes) - L(table_32bytes)
+	.long	L(28bytes) - L(table_32bytes)
+	.long	L(29bytes) - L(table_32bytes)
+	.long	L(30bytes) - L(table_32bytes)
+	.long	L(31bytes) - L(table_32bytes)
+
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
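L(find_diff)/L(set) convert the flags of an unsigned compare into the final return value with two sbb instructions instead of branches: sbbl %eax, %eax yields -1 when the carry flag is set and 0 otherwise, and sbbl $-1, %eax then turns that into -1 or +1.  The equivalent C, reached only once the blocks are known to differ:

    /* Valid only when a != b: the sign memcmp must return.  */
    static int
    memcmp_sign (unsigned int a, unsigned int b)
    {
      return a < b ? -1 : 1;
    }

The jump table after END simply dispatches on the remaining length; storing offsets relative to the table itself keeps the data position-independent.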
diff --git a/REORG.TODO/sysdeps/i386/i686/memcpy.S b/REORG.TODO/sysdeps/i386/i686/memcpy.S
new file mode 100644
index 0000000000..1d61447430
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcpy.S
@@ -0,0 +1,98 @@
+/* Copy memory block and return pointer to beginning of destination block
+   For Intel 80x86, x>=6.
+   This file is part of the GNU C Library.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+	.text
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memcpy_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memcpy_chk)
+#endif
+ENTRY (memcpy)
+
+	movl	%edi, %eax
+	movl	DEST(%esp), %edi
+	movl	%esi, %edx
+	movl	SRC(%esp), %esi
+
+	movl	%edi, %ecx
+	xorl	%esi, %ecx
+	andl	$3, %ecx
+	movl	LEN(%esp), %ecx
+	cld
+	jne	.Lunaligned
+
+	cmpl	$3, %ecx
+	jbe	.Lunaligned
+
+	testl	$3, %esi
+	je	1f
+	movsb
+	decl	%ecx
+	testl	$3, %esi
+	je	1f
+	movsb
+	decl	%ecx
+	testl	$3, %esi
+	je	1f
+	movsb
+	decl	%ecx
+1:	pushl	%eax
+	movl	%ecx, %eax
+	shrl	$2, %ecx
+	andl	$3, %eax
+	rep
+	movsl
+	movl	%eax, %ecx
+	rep
+	movsb
+	popl	%eax
+
+.Lend:	movl	%eax, %edi
+	movl	%edx, %esi
+	movl	DEST(%esp), %eax
+
+	ret
+
+	/* When we come here the pointers do not have the same
+	   alignment or the length is too short.  No need to optimize for
+	   aligned memory accesses. */
+.Lunaligned:
+	shrl	$1, %ecx
+	jnc	1f
+	movsb
+1:	shrl	$1, %ecx
+	jnc	2f
+	movsw
+2:	rep
+	movsl
+	jmp	.Lend
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
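The .Lunaligned tail keeps no alignment bookkeeping at all: shifting the count right one bit at a time lets the carry flag decide whether one stray byte and then one stray word must be copied before the rep movsl bulk loop.  A C model of that control flow:

    #include <stddef.h>
    #include <string.h>

    /* Bit 0 of n selects one byte, bit 1 one 16-bit word; the
       remaining n/4 32-bit words go through the bulk copy
       (rep movsl in the assembly).  */
    static void
    copy_unaligned (unsigned char *d, const unsigned char *s, size_t n)
    {
      if (n & 1)                      /* shrl $1; jnc -> movsb */
        *d++ = *s++;
      if (n & 2)                      /* shrl $1; jnc -> movsw */
        {
          memcpy (d, s, 2);
          d += 2;
          s += 2;
        }
      memcpy (d, s, (n / 4) * 4);     /* rep movsl */
    }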
diff --git a/REORG.TODO/sysdeps/i386/i686/memmove.S b/REORG.TODO/sysdeps/i386/i686/memmove.S
new file mode 100644
index 0000000000..f60c3d501b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memmove.S
@@ -0,0 +1,120 @@
+/* Copy memory block and return pointer to beginning of destination block
+   For Intel 80x86, x>=6.
+   This file is part of the GNU C Library.
+   Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 2003.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* one spilled register */
+#define RTN	PARMS
+
+	.text
+
+#ifdef USE_AS_BCOPY
+# define SRC	RTN
+# define DEST	SRC+4
+# define LEN	DEST+4
+#else
+# define DEST	RTN
+# define SRC	DEST+4
+# define LEN	SRC+4
+
+# if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memmove_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memmove_chk)
+# endif
+#endif
+
+ENTRY (memmove)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+
+	movl	LEN(%esp), %ecx
+	movl	DEST(%esp), %edi
+	cfi_rel_offset (edi, 0)
+	movl	%esi, %edx
+	movl	SRC(%esp), %esi
+	cfi_register (esi, edx)
+
+	movl	%edi, %eax
+	subl	%esi, %eax
+	cmpl	%eax, %ecx
+	ja	3f
+
+	cld
+	shrl	$1, %ecx
+	jnc	1f
+	movsb
+1:	shrl	$1, %ecx
+	jnc	2f
+	movsw
+2:	rep
+	movsl
+	movl	%edx, %esi
+	cfi_restore (esi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#endif
+
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+	cfi_register (esi, edx)
+
+	/* Backward copying.  */
+3:	std
+	leal	-1(%edi, %ecx), %edi
+	leal	-1(%esi, %ecx), %esi
+	shrl	$1, %ecx
+	jnc	1f
+	movsb
+1:	subl	$1, %edi
+	subl	$1, %esi
+	shrl	$1, %ecx
+	jnc	2f
+	movsw
+2:	subl	$2, %edi
+	subl	$2, %esi
+	rep
+	movsl
+	movl	%edx, %esi
+	cfi_restore (esi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#endif
+
+	cld
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (memmove)
+#ifndef USE_AS_BCOPY
+libc_hidden_builtin_def (memmove)
+#endif
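The direction choice above hinges on one unsigned subtraction: comparing dest - src against the length detects exactly the overlaps in which a forward copy would clobber source bytes that have not been read yet.  In C:

    #include <stddef.h>
    #include <stdint.h>

    /* Copy backward only when DEST starts inside [SRC, SRC+N).  When
       DEST is below SRC the subtraction wraps to a huge unsigned
       value, so the forward path is taken.  */
    static int
    must_copy_backward (const void *dest, const void *src, size_t n)
    {
      return (uintptr_t) dest - (uintptr_t) src < n;
    }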
diff --git a/REORG.TODO/sysdeps/i386/i686/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/mempcpy.S
new file mode 100644
index 0000000000..31cb4efdb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/mempcpy.S
@@ -0,0 +1,65 @@
+/* Copy memory block and return pointer to following byte.
+   For Intel 80x86, x>=6.
+   This file is part of the GNU C Library.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+	.text
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__mempcpy_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__mempcpy_chk)
+#endif
+ENTRY (__mempcpy)
+
+	movl	LEN(%esp), %ecx
+	movl	%edi, %eax
+	cfi_register (edi, eax)
+	movl	DEST(%esp), %edi
+	movl	%esi, %edx
+	cfi_register (esi, edx)
+	movl	SRC(%esp), %esi
+	cld
+	shrl	$1, %ecx
+	jnc	1f
+	movsb
+1:	shrl	$1, %ecx
+	jnc	2f
+	movsw
+2:	rep
+	movsl
+	xchgl	%edi, %eax
+	cfi_restore (edi)
+	movl	%edx, %esi
+	cfi_restore (esi)
+
+	ret
+END (__mempcpy)
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/memset.S b/REORG.TODO/sysdeps/i386/i686/memset.S
new file mode 100644
index 0000000000..24d06178d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memset.S
@@ -0,0 +1,100 @@
+/* memset/bzero -- set memory area to CH/0
+   Highly optimized version for ix86, x>=6.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#ifdef USE_AS_BZERO
+# define DEST	PARMS
+# define LEN	DEST+4
+#else
+# define RTN	PARMS
+# define DEST	RTN
+# define CHR	DEST+4
+# define LEN	CHR+4
+#endif
+
+        .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY_CHK (__memset_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk)
+#endif
+ENTRY (memset)
+
+	cld
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	movl	DEST(%esp), %edx
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xorl	%eax, %eax	/* fill with 0 */
+#else
+	movzbl	CHR(%esp), %eax
+#endif
+	jecxz	1f
+	movl	%edx, %edi
+	cfi_rel_offset (edi, 0)
+	andl	$3, %edx
+	jz	2f	/* aligned */
+	jp	3f	/* misaligned at 3, store just one byte below */
+	stosb		/* misaligned at 1 or 2, store two bytes */
+	decl	%ecx
+	jz	1f
+3:	stosb
+	decl	%ecx
+	jz	1f
+	xorl	$1, %edx
+	jnz	2f	/* was misaligned at 2 or 3, now aligned */
+	stosb		/* was misaligned at 1, store third byte */
+	decl	%ecx
+2:	movl	%ecx, %edx
+	shrl	$2, %ecx
+	andl	$3, %edx
+#ifndef USE_AS_BZERO
+	imul	$0x01010101, %eax
+#endif
+	rep
+	stosl
+	movl	%edx, %ecx
+	rep
+	stosb
+
+1:
+#ifndef USE_AS_BZERO
+	movl DEST(%esp), %eax	/* start address of destination is result */
+#endif
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (memset)
+libc_hidden_builtin_def (memset)
+
+#if defined SHARED && IS_IN (libc) && !defined __memset_chk \
+    && !defined USE_AS_BZERO
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#endif
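Two details above are worth spelling out: the jz/jp pair decodes the misalignment value in {0,1,2,3} using only the zero and parity flags of a single andl, and the imul by 0x01010101 replicates the fill byte into all four byte lanes of a word for the rep stosl loop.  The splat in C:

    #include <stdint.h>

    /* Model of `imul $0x01010101, %eax': c = 0xAB becomes 0xABABABAB.  */
    static uint32_t
    splat_byte (int c)
    {
      return (uint32_t) (unsigned char) c * 0x01010101u;
    }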
diff --git a/REORG.TODO/sysdeps/i386/i686/memusage.h b/REORG.TODO/sysdeps/i386/i686/memusage.h
new file mode 100644
index 0000000000..77a020d7c0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memusage.h
@@ -0,0 +1,21 @@
+/* Copyright (C) 2000-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; })
+#define GETTIME(low,high) asm ("rdtsc" : "=a" (low), "=d" (high))
+
+#include <sysdeps/generic/memusage.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
new file mode 100644
index 0000000000..4a0c20c051
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
@@ -0,0 +1,44 @@
+ifeq ($(subdir),csu)
+tests += test-multiarch
+endif
+
+ifeq ($(subdir),string)
+gen-as-const-headers += locale-defines.sym
+sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
+		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
+		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+		   memcmp-ssse3 memcmp-sse4 varshift \
+		   strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+		   strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+		   strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
+		   strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
+		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
+		   memchr-sse2 memchr-sse2-bsf \
+		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+		   rawmemchr-sse2 rawmemchr-sse2-bsf \
+		   strnlen-sse2 strnlen-c \
+		   strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
+		   strncase_l-c strncase-c strncase_l-ssse3 \
+		   strcasecmp_l-sse4 strncase_l-sse4 \
+		   bcopy-sse2-unaligned memcpy-sse2-unaligned \
+		   mempcpy-sse2-unaligned memmove-sse2-unaligned \
+		   strcspn-c strpbrk-c strspn-c
+CFLAGS-varshift.c += -msse4
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
+		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
+		   wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
+endif
+
+ifeq ($(subdir),math)
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
new file mode 100644
index 0000000000..efef2a10dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
new file mode 100644
index 0000000000..cbc8b420e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
new file mode 100644
index 0000000000..36aac44b9c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
new file mode 100644
index 0000000000..877f82c28f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,59 @@
+/* Multiple versions of bcopy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(bcopy)
+	.type	bcopy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bcopy_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep)
+2:	ret
+END(bcopy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __bcopy_ia32, @function; \
+	.p2align 4; \
+	.globl __bcopy_ia32; \
+	.hidden __bcopy_ia32; \
+	__bcopy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
+
+#endif
+
+#include "../bcopy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
new file mode 100644
index 0000000000..507b288bb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2_rep __bzero_sse2_rep
+#include "memset-sse2-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
new file mode 100644
index 0000000000..8d04512e4e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2 __bzero_sse2
+#include "memset-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
new file mode 100644
index 0000000000..9dac490aa2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,62 @@
+/* Multiple versions of bzero
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__bzero)
+	.type	__bzero, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bzero_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bzero_sse2)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bzero_sse2_rep)
+2:	ret
+END(__bzero)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __bzero_ia32, @function; \
+	.p2align 4; \
+	.globl __bzero_ia32; \
+	.hidden __bzero_ia32; \
+	__bzero_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI___bzero; __GI___bzero = __bzero_ia32
+# endif
+#endif
+
+#include "../bzero.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e8026a2a78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -0,0 +1,376 @@
+/* Enumerate available IFUNC implementations of a function.  i686 version.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ifunc-impl-list.h>
+#include "init-arch.h"
+
+/* Maximum number of IFUNC implementations.  */
+#define MAX_IFUNC	4
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+   NAME and return the number of valid entries.  */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+  assert (max >= MAX_IFUNC);
+
+  size_t i = 0;
+
+  /* Support sysdeps/i386/i686/multiarch/bcopy.S.  */
+  IFUNC_IMPL (i, name, bcopy,
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
+			      __bcopy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/bzero.S.  */
+  IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memchr.S.  */
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memcmp.S.  */
+  IFUNC_IMPL (i, name, memcmp,
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __memcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+			      __memcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memmove_chk.S.  */
+  IFUNC_IMPL (i, name, __memmove_chk,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memmove_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memmove.S.  */
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
+			      __memmove_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memrchr.S.  */
+  IFUNC_IMPL (i, name, memrchr,
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memset_chk.S.  */
+  IFUNC_IMPL (i, name, __memset_chk,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memset_chk_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memset.S.  */
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/rawmemchr.S.  */
+  IFUNC_IMPL (i, name, rawmemchr,
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+			      __rawmemchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+			      __rawmemchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/stpncpy.S.  */
+  IFUNC_IMPL (i, name, stpncpy,
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2),
+			      __stpncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/stpcpy.S.  */
+  IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2),
+			      __stpcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcasecmp.S.  */
+  IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S.  */
+  IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_l_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+			      __strcasecmp_l_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcat.S.  */
+  IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+			      __strcat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2),
+			      __strcat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strchr.S.  */
+  IFUNC_IMPL (i, name, strchr,
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcmp.S.  */
+  IFUNC_IMPL (i, name, strcmp,
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+			      __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcpy.S.  */
+  IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+			      __strcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2),
+			      __strcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcspn.S.  */
+  IFUNC_IMPL (i, name, strcspn,
+	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strcspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncase.S.  */
+  IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
+			      __strncasecmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncase_l.S.  */
+  IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_l_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
+			      __strncasecmp_l_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncat.S.  */
+  IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
+			      __strncat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2),
+			      __strncat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncpy.S.  */
+  IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
+			      __strncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2),
+			      __strncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strnlen.S.  */
+  IFUNC_IMPL (i, name, strnlen,
+	      IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2),
+			      __strnlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strpbrk.S.  */
+  IFUNC_IMPL (i, name, strpbrk,
+	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
+			      __strpbrk_sse42)
+	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strrchr.S.  */
+  IFUNC_IMPL (i, name, strrchr,
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strspn.S.  */
+  IFUNC_IMPL (i, name, strspn,
+	      IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcschr.S.  */
+  IFUNC_IMPL (i, name, wcschr,
+	      IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2),
+			      __wcschr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcscmp.S.  */
+  IFUNC_IMPL (i, name, wcscmp,
+	      IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2),
+			      __wcscmp_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcscpy.S.  */
+  IFUNC_IMPL (i, name, wcscpy,
+	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+			      __wcscpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcslen.S.  */
+  IFUNC_IMPL (i, name, wcslen,
+	      IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2),
+			      __wcslen_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcsrchr.S.  */
+  IFUNC_IMPL (i, name, wcsrchr,
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2),
+			      __wcsrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wmemcmp.S.  */
+  IFUNC_IMPL (i, name, wmemcmp,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __wmemcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
+			      __wmemcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32))
+
+#ifdef SHARED
+  /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __memcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memcpy.S.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
+			      __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __mempcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __mempcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/mempcpy.S.  */
+  IFUNC_IMPL (i, name, mempcpy,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
+			      __mempcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strlen.S.  */
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncmp.S.  */
+  IFUNC_IMPL (i, name, strncmp,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strncmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
+			      __strncmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32))
+#endif
+
+  return i;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
new file mode 100644
index 0000000000..aebff9a4f9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
@@ -0,0 +1,11 @@
+#include <locale/localeinfo.h>
+#include <langinfo.h>
+#include <stddef.h>
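+
+/* A note on the .sym format, as used by glibc's constant generator:
+   lines above the "--" separator below are C preamble; each line after
+   it defines an assembly-visible constant from a C expression.  */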
+
+--
+
+LOCALE_T___LOCALES		offsetof (struct __locale_struct, __locales)
+LC_CTYPE
+_NL_CTYPE_NONASCII_CASE
+LOCALE_DATA_VALUES		offsetof (struct __locale_data, values)
+SIZEOF_VALUES			sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 0000000000..dd316486e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,502 @@
+/* Optimized memchr with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2_bsf
+# endif
+
+	.text
+ENTRY (MEMCHR)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null_1)
+# endif
+	mov	%ecx, %eax
+
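+/* Replicate the search byte into all 16 bytes of xmm1: two punpcklbw
+   steps make four copies, then pshufd $0 splats the low dword.  */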
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
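+/* addr % 64 <= 48, so the unaligned 16-byte load below cannot cross a
+   64-byte boundary (and therefore cannot fault on the next page).  */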
+	movdqu	(%eax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	je	L(unaligned_no_match_1)
+/* Check which byte is a match.  */
+	bsf	%ecx, %ecx
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%ecx, %edx
+	jbe	L(return_null_1)
+# endif
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$16, %edx
+	jbe	L(return_null_1)
+	PUSH	(%edi)
+	lea	16(%eax), %edi
+	and	$15, %eax
+	and	$-16, %edi
+	add	%eax, %edx
+# else
+	lea	16(%eax), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(return_null_1):
+	xor	%eax, %eax
+	ret
+
+# ifndef USE_AS_RAWMEMCHR
+	CFI_POP	(%edi)
+# endif
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	PUSH	(%edi)
+	mov	%eax, %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	mov	%eax, %edx
+	and	$15, %ecx
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	sar	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+/* Check which byte is a match.  */
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%eax, %edx
+	jbe	L(return_null)
+	add	%edi, %eax
+	add	%ecx, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	add	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+        /* Calculate the last acceptable address and check for possible
+           addition overflow by using saturated math:
+           edx = ecx + edx
+           edx |= -(edx < ecx)  */
+	add	%ecx, %edx
+	sbb	%eax, %eax
+	or	%eax, %edx
+	sub	$16, %edx
+	jbe	L(return_null)
+	add	$16, %edi
+# else
+	add	$16, %edx
+# endif
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	test	$0x3f, %edi
+# else
+	test	$0x3f, %edx
+# endif
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm3
+# else
+	movdqa	48(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
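+/* pcmpeqb leaves 0x00 or 0xff in every byte, so pmaxub acts as a
+   byte-wise OR of the four match masks: a single pmovmskb/test then
+   covers all 64 bytes.  */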
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+	pmovmskb %xmm4, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+
+	pcmpeqb	%xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	pmovmskb %xmm1, %eax
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	48(%edi, %eax), %eax
+	RETURN
+# else
+	lea	48(%edx, %eax), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
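+	/* EDX went to or below zero in the loop; adding back the last
+	   64 restores the count of bytes still to check.  */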
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	16(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	xor	%eax, %eax
+	RETURN
+# endif
+	.p2align 4
+L(matches0):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	-16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	-16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	add	%edi, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches16):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches32):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	32(%eax, %edi), %eax
+	RETURN
+# else
+	lea	32(%eax, %edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(matches_1):
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	add	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(matches16_1):
+	sub	$16, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	16(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches32_1):
+	sub	$32, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	32(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches48_1):
+	sub	$48, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	48(%edi, %eax), %eax
+	RETURN
+# endif
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 0000000000..172d70de13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,709 @@
+/* Optimized memchr with SSE2, without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+#  define ENTRANCE PUSH(%edi);
+#  define PARMS  8
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# else
+#  define ENTRANCE
+#  define PARMS  4
+# endif
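+
+/* memchr saves %edi on entry (ENTRANCE), so its incoming arguments sit
+   4 bytes further from %esp than rawmemchr's.  */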
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2
+# endif
+
+	atom_text_section
+ENTRY (MEMCHR)
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null)
+# endif
+
+	punpcklbw %xmm1, %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	%ecx, %edi
+# else
+	mov	%ecx, %edx
+# endif
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqu	(%edi), %xmm0
+# else
+	movdqu	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog)
+
+	sub	$16, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	add	%ecx, %edx
+# else
+	jnz	L(match_case1_prolog)
+	lea	16(%edx), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %ecx
+# ifndef USE_AS_RAWMEMCHR
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	sar	%cl, %eax
+	test	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog1)
+        /* "ecx" is less than 16.  Calculate "edx + ecx - 16" by using
+	   "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to void
+	   possible addition overflow.  */
+	neg	%ecx
+	add	$16, %ecx
+	sub	%ecx, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+# else
+	jnz	L(match_case1_prolog1)
+	lea	16(%edx), %edx
+# endif
+
+	.p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%edi), %xmm0
+# else
+	lea	64(%edx), %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	lea	64(%edx), %edx
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	xor	%ecx, %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	pmovmskb %xmm2, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm1, %eax
+	lea	16(%ecx), %ecx
+
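+/* Find the first set bit of the match mask in EAX without bsf: check
+   the low byte, then the high byte, each a nibble at a time and then
+   bit by bit.  This variant is selected when the CPU reports Slow_BSF.  */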
+	.p2align 4
+L(match_case1):
+# ifndef USE_AS_RAWMEMCHR
+	add	%ecx, %edi
+# else
+L(match_case1_prolog1):
+	add	%ecx, %edx
+L(match_case1_prolog):
+# endif
+	test	%al, %al
+	jz	L(match_case1_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case1_8)
+	test	$0x01, %al
+	jnz	L(ExitCase1_1)
+	test	$0x02, %al
+	jnz	L(ExitCase1_2)
+	test	$0x04, %al
+	jnz	L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+	lea	3(%edi), %eax
+	RETURN
+# else
+	lea	3(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_8):
+	test	$0x10, %al
+	jnz	L(ExitCase1_5)
+	test	$0x20, %al
+	jnz	L(ExitCase1_6)
+	test	$0x40, %al
+	jnz	L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+	lea	7(%edi), %eax
+	RETURN
+# else
+	lea	7(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case1_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase1_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase1_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+	lea	11(%edi), %eax
+	RETURN
+# else
+	lea	11(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase1_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase1_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+	lea	15(%edi), %eax
+	RETURN
+# else
+	lea	15(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$32, %edx
+	jbe	L(return_null)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+
+	xor	%eax, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %eax
+	RETURN
+# else
+	mov	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+	lea	1(%edi), %eax
+	RETURN
+# else
+	lea	1(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+	lea	2(%edi), %eax
+	RETURN
+# else
+	lea	2(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+	lea	4(%edi), %eax
+	RETURN
+# else
+	lea	4(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+	lea	5(%edi), %eax
+	RETURN
+# else
+	lea	5(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+	lea	6(%edi), %eax
+	RETURN
+# else
+	lea	6(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+	lea	8(%edi), %eax
+	RETURN
+# else
+	lea	8(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+	lea	9(%edi), %eax
+	RETURN
+# else
+	lea	9(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+	lea	10(%edi), %eax
+	RETURN
+# else
+	lea	10(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+	lea	12(%edi), %eax
+	RETURN
+# else
+	lea	12(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+	lea	13(%edi), %eax
+	RETURN
+# else
+	lea	13(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+	lea	14(%edi), %eax
+	RETURN
+# else
+	lea	14(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(match_case2):
+	sub	%ecx, %edx
+L(match_case2_prolog1):
+	add	%ecx, %edi
+L(match_case2_prolog):
+	test	%al, %al
+	jz	L(match_case2_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case2_8)
+	test	$0x01, %al
+	jnz	L(ExitCase2_1)
+	test	$0x02, %al
+	jnz	L(ExitCase2_2)
+	test	$0x04, %al
+	jnz	L(ExitCase2_3)
+	sub	$4, %edx
+	jb	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_8):
+	test	$0x10, %al
+	jnz	L(ExitCase2_5)
+	test	$0x20, %al
+	jnz	L(ExitCase2_6)
+	test	$0x40, %al
+	jnz	L(ExitCase2_7)
+	sub	$8, %edx
+	jb	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case2_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase2_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase2_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase2_11)
+	sub	$12, %edx
+	jb	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase2_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase2_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase2_15)
+	sub	$16, %edx
+	jb	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_1):
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_2):
+	sub	$2, %edx
+	jb	L(return_null)
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_3):
+	sub	$3, %edx
+	jb	L(return_null)
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_5):
+	sub	$5, %edx
+	jb	L(return_null)
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_6):
+	sub	$6, %edx
+	jb	L(return_null)
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_7):
+	sub	$7, %edx
+	jb	L(return_null)
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_9):
+	sub	$9, %edx
+	jb	L(return_null)
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_10):
+	sub	$10, %edx
+	jb	L(return_null)
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_11):
+	sub	$11, %edx
+	jb	L(return_null)
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_13):
+	sub	$13, %edx
+	jb	L(return_null)
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_14):
+	sub	$14, %edx
+	jb	L(return_null)
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_15):
+	sub	$15, %edx
+	jb	L(return_null)
+	lea	14(%edi), %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 0000000000..bd0dace290
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of memchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__memchr)
+	.type	__memchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f
+
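+	/* SSE2 is present and bsf is reported slow: use the bsf-free
+	   __memchr_sse2.  2f is the plain ia32 fallback, 3f the bsf
+	   variant.  */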
+	LOAD_FUNC_GOT_EAX (__memchr_sse2)
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf)
+	ret
+END(__memchr)
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memchr_ia32, @function; \
+	.globl __memchr_ia32; \
+	.p2align 4; \
+	__memchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they will be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..2aa13048b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1225 @@
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_2
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define BLK1	PARMS
+# define BLK2	BLK1 + 4
+# define LEN	BLK2 + 4
+# define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+/* We first load PC into EBX.  */	\
+	SETUP_PIC_REG(bx);	\
+/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+/* Get the entry and convert the relative offset to the	\
+	absolute address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+/* We loaded the jump table and adjusted EDX/ESI. Go.  */	\
+	jmp	*%ebx
+# else
+#  define JMPTBL(I, B)	I
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+   wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.  */
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(return0)
+# else
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+# endif
+
+	pxor	%xmm0, %xmm0
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+
+# ifndef USE_AS_WMEMCMP
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+# else
+	jb	L(less8bytes)
+	PUSH	(%ebx)
+# endif
+
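+/* Advance both pointers to the end of the blocks; the per-length tails
+   reached through the jump table then read with negative displacements
+   from the end.  */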
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+
+L(nonzero):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+# endif
+
+	.p2align 4
+L(0bytes):
+	POP	(%ebx)
+	xor	%eax, %eax
+	ret
+
+# ifdef USE_AS_WMEMCMP
+
+/* For wmemcmp, the case N == 1.  */
+
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	je	L(return0)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+	.p2align 4
+L(return0):
+	xor	%eax, %eax
+	ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
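+	/* Flags still reflect "cmp $1, %ecx": CF set means n == 0,
+	   return 0; otherwise n == 1 and the single bytes are compared
+	   by subtraction.  */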
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+# endif
+	.p2align 4
+L(64bytesormore):
+	PUSH	(%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
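+/* xmm0 stays all-zero, so ptest sets CF iff blk1 ^ blk2 (xmm2) is
+   zero; jnc exits as soon as a 16-byte block differs.  */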
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifdef USE_AS_WMEMCMP
+
+/* Label needed only to fill table_64bytes.  */
+L(unreal_case):
+/* No code here.  */
+
+# endif
+	.p2align 4
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# else
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	cmp	-4(%edx), %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(50bytes):
+	mov	$-50, %ebx
+	movdqu	-50(%eax), %xmm1
+	movdqu	-50(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	mov	$-34, %ebx
+	movdqu	-34(%eax), %xmm1
+	movdqu	-34(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(51bytes):
+	mov	$-51, %ebx
+	movdqu	-51(%eax), %xmm1
+	movdqu	-51(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	mov	$-35, %ebx
+	movdqu	-35(%eax), %xmm1
+	movdqu	-35(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+L(1bytes):
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(52bytes):
+	movdqu	-52(%eax), %xmm1
+	movdqu	-52(%edx), %xmm2
+	mov	$-52, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%eax), %xmm1
+	movdqu	-36(%edx), %xmm2
+	mov	$-36, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%eax), %xmm1
+	movdqu	-20(%edx), %xmm2
+	mov	$-20, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(53bytes):
+	movdqu	-53(%eax), %xmm1
+	movdqu	-53(%edx), %xmm2
+	mov	$-53, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	mov	$-37, %ebx
+	movdqu	-37(%eax), %xmm1
+	movdqu	-37(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	mov	$-21, %ebx
+	movdqu	-21(%eax), %xmm1
+	movdqu	-21(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(54bytes):
+	movdqu	-54(%eax), %xmm1
+	movdqu	-54(%edx), %xmm2
+	mov	$-54, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	mov	$-38, %ebx
+	movdqu	-38(%eax), %xmm1
+	movdqu	-38(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	mov	$-22, %ebx
+	movdqu	-22(%eax), %xmm1
+	movdqu	-22(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(55bytes):
+	movdqu	-55(%eax), %xmm1
+	movdqu	-55(%edx), %xmm2
+	mov	$-55, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	mov	$-39, %ebx
+	movdqu	-39(%eax), %xmm1
+	movdqu	-39(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	mov	$-23, %ebx
+	movdqu	-23(%eax), %xmm1
+	movdqu	-23(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(56bytes):
+	movdqu	-56(%eax), %xmm1
+	movdqu	-56(%edx), %xmm2
+	mov	$-56, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	mov	$-40, %ebx
+	movdqu	-40(%eax), %xmm1
+	movdqu	-40(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	mov	$-24, %ebx
+	movdqu	-24(%eax), %xmm1
+	movdqu	-24(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(57bytes):
+	movdqu	-57(%eax), %xmm1
+	movdqu	-57(%edx), %xmm2
+	mov	$-57, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	mov	$-41, %ebx
+	movdqu	-41(%eax), %xmm1
+	movdqu	-41(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	mov	$-25, %ebx
+	movdqu	-25(%eax), %xmm1
+	movdqu	-25(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(58bytes):
+	movdqu	-58(%eax), %xmm1
+	movdqu	-58(%edx), %xmm2
+	mov	$-58, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	mov	$-42, %ebx
+	movdqu	-42(%eax), %xmm1
+	movdqu	-42(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	mov	$-26, %ebx
+	movdqu	-26(%eax), %xmm1
+	movdqu	-26(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(59bytes):
+	movdqu	-59(%eax), %xmm1
+	movdqu	-59(%edx), %xmm2
+	mov	$-59, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	mov	$-43, %ebx
+	movdqu	-43(%eax), %xmm1
+	movdqu	-43(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	mov	$-27, %ebx
+	movdqu	-27(%eax), %xmm1
+	movdqu	-27(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(60bytes):
+	movdqu	-60(%eax), %xmm1
+	movdqu	-60(%edx), %xmm2
+	mov	$-60, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	mov	$-44, %ebx
+	movdqu	-44(%eax), %xmm1
+	movdqu	-44(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	mov	$-28, %ebx
+	movdqu	-28(%eax), %xmm1
+	movdqu	-28(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(61bytes):
+	movdqu	-61(%eax), %xmm1
+	movdqu	-61(%edx), %xmm2
+	mov	$-61, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	mov	$-45, %ebx
+	movdqu	-45(%eax), %xmm1
+	movdqu	-45(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	mov	$-29, %ebx
+	movdqu	-29(%eax), %xmm1
+	movdqu	-29(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(62bytes):
+	movdqu	-62(%eax), %xmm1
+	movdqu	-62(%edx), %xmm2
+	mov	$-62, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	mov	$-46, %ebx
+	movdqu	-46(%eax), %xmm1
+	movdqu	-46(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	mov	$-30, %ebx
+	movdqu	-30(%eax), %xmm1
+	movdqu	-30(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(63bytes):
+	movdqu	-63(%eax), %xmm1
+	movdqu	-63(%edx), %xmm2
+	mov	$-63, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	mov	$-47, %ebx
+	movdqu	-47(%eax), %xmm1
+	movdqu	-47(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	mov	$-31, %ebx
+	movdqu	-31(%eax), %xmm1
+	movdqu	-31(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+
+	.p2align 4
+L(64bytes):
+	movdqu	-64(%eax), %xmm1
+	movdqu	-64(%edx), %xmm2
+	mov	$-64, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%eax), %xmm1
+	movdqu	-48(%edx), %xmm2
+	mov	$-48, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%eax), %xmm1
+	movdqu	-32(%edx), %xmm2
+	mov	$-32, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-16(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	mov	(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	mov	4(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	mov	8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	mov	12(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# else
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	cmp	4(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	cmp	8(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	cmp	12(%edx), %ecx
+
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
+
+	.p2align 4
+L(find_diff):
+# ifndef USE_AS_WMEMCMP
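+	/* ECX and EBX hold the differing dwords.  Compare the low byte,
+	   the low word, then the upper halves, so the final flags
+	   reflect the lowest differing byte; L(end) keeps +1 on "above"
+	   and negates to -1 otherwise.  */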
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+# else
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(bigger):
+	ret
+# endif
+END (MEMCMP)
+
+	.section .rodata.sse4.2,"a",@progbits
+	.p2align 2
+	.type	L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# else
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..5ebf5a4d73
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,2157 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP		__memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS		4
+# define BLK1		PARMS
+# define BLK2		BLK1+4
+# define LEN		BLK2+4
+# define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
+
+/* Warning!
+   wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.  */
+
+	atom_text_section
+ENTRY (MEMCMP)
+	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(zero)
+# endif
+
+	movl	BLK1(%esp), %eax
+	cmp	$48, %ecx
+	movl	BLK2(%esp), %edx
+	jae	L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+# endif
+
+	PUSH	(%ebx)
+	add	%ecx, %edx
+	add	%ecx, %eax
+	jmp	L(less48bytes)
+
+	CFI_POP	(%ebx)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
+	jb	L(zero)
+	movb	(%eax), %cl
+	cmp	(%edx), %cl
+	je	L(zero)
+	mov	$1, %eax
+	ja	L(1bytesend)
+	neg	%eax
+L(1bytesend):
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(48bytesormore):
+	PUSH	(%ebx)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	cfi_remember_state
+	movdqu	(%eax), %xmm3
+	movdqu	(%edx), %xmm0
+	movl	%eax, %edi
+	movl	%edx, %esi
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%edi), %edi
+
+	sub	$0xffff, %edx
+	lea	16(%esi), %esi
+	jnz	L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%edx, %edi
+	sub	%edx, %esi
+	add	%edx, %ecx
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%edx, %esi
+
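+/* EDI is now 16-byte aligned and EDX holds the relative misalignment
+   of ESI.  Dispatch to L(shr_N), which realigns the source data with
+   palignr $N (wmemcmp only needs the multiples of 4).  */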
+# ifndef USE_AS_WMEMCMP
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	.p2align 2
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+# endif
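+
+/* Every L(shr_N) path below handles a relative misalignment of N
+   bytes between the two buffers: two aligned 16-byte loads from the
+   second buffer are concatenated and shifted right by N with palignr.
+   The shift count must be an immediate, hence one unrolled path per
+   offset.  An illustrative intrinsics sketch (a reference model, not
+   code from this file; p and N are placeholders):
+
+	__m128i lo = _mm_load_si128 ((const __m128i *) p);
+	__m128i hi = _mm_load_si128 ((const __m128i *) p + 1);
+	__m128i v  = _mm_alignr_epi8 (hi, lo, N);
+
+   v then holds the 16 bytes starting at the unaligned address p + N,
+   ready to be compared with pcmpeqb against the aligned buffer.  */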
+
+	.p2align 4
+L(shr_0):
+	cmp	$80, %ecx
+	jae	L(shr_0_gobble)
+	lea	-48(%ecx), %ecx
+	xor	%eax, %eax
+	movaps	(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+	movaps	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+	pand	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	add	$32, %edi
+	add	$32, %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_0_gobble):
+	lea	-48(%ecx), %ecx
+	movdqa	(%esi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %ecx
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	movdqa	32(%esi), %xmm0
+	movdqa	48(%esi), %xmm2
+	sbb	$0xffff, %edx
+	pcmpeqb	32(%edi), %xmm0
+	pcmpeqb	48(%edi), %xmm2
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	jz	L(shr_0_gobble_loop)
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %ecx
+	jge	L(shr_0_gobble_loop_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_0_gobble_loop_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_1):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_1_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_1_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$1,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$1,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$1,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_1_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_1_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_1_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_2):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_2_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_2_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$2,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$2,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$2,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_2_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_2_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_2_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_3):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_3_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_3_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$3,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$3,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$3,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_3_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_3_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_3_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_4):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_4_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_4_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$4,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$4,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$4,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_4_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_4_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_4_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_5):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_5_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_5_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$5,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$5,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$5,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_5_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_5_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_5_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_6):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_6_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_6_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$6,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$6,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$6,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_6_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_6_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_6_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_7):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_7_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$7,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	7(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_7_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$7,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$7,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$7,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_7_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_7_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_7_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	7(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_8):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_8_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$8,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	8(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_8_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$8,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$8,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$8,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_8_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_8_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_8_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	8(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_9):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_9_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$9,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	9(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_9_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$9,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$9,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$9,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_9_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_9_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_9_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	9(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_10):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_10_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$10, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_10_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$10, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$10,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$10,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_10_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_10_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_10_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_11):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_11_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$11, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_11_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$11, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$11,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$11,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_11_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_11_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_11_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_12):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_12_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$12, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_12_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$12, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$12,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$12,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_12_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_12_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_12_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_13):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_13_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$13, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_13_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$13, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$13,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$13,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_13_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_13_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_14):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_14_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$14, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_14_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$14, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$14,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$14,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_14_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_14_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_15):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_15_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$15, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_15_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$15, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$15,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$15,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_15_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_15_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(exit):
+	pmovmskb %xmm1, %ebx
+	sub	$0xffff, %ebx
+	jz	L(first16bytes)
+	lea	-16(%esi), %esi
+	lea	-16(%edi), %edi
+	mov	%ebx, %edx
+
+L(first16bytes):
+	add	%eax, %esi
+L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
+	test	%dl, %dl
+	jz	L(next_24_bytes)
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+L(Byte23):
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte16):
+	movzbl	-16(%edi), %eax
+	movzbl	-16(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte17):
+	movzbl	-15(%edi), %eax
+	movzbl	-15(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte18):
+	movzbl	-14(%edi), %eax
+	movzbl	-14(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte19):
+	movzbl	-13(%edi), %eax
+	movzbl	-13(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte20):
+	movzbl	-12(%edi), %eax
+	movzbl	-12(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte21):
+	movzbl	-11(%edi), %eax
+	movzbl	-11(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte22):
+	movzbl	-10(%edi), %eax
+	movzbl	-10(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(next_24_bytes):
+	lea	8(%edi), %edi
+	lea	8(%esi), %esi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	.p2align 4
+L(Byte31):
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
+	sub	%edx, %eax
+	RETURN_END
+# else
+
+/* Special case for wmemcmp: locate the differing 4-byte element.  */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	-16(%edi), %eax
+	cmp	-16(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	-12(%edi), %eax
+	cmp	-12(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%edi), %eax
+	cmp	-8(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	-4(%edi), %eax
+	cmp	-4(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(nequal_bigger):
+	RETURN_END
+# endif
+
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(more8bytes):
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)
+# else
+	jmp	L(12bytes)
+# endif
+
+	.p2align 4
+L(more16bytes):
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)
+# else
+	jmp	L(20bytes)
+# endif
+
+	.p2align 4
+L(more24bytes):
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)
+# else
+	jmp	L(28bytes)
+# endif
+
+	.p2align 4
+L(more32bytes):
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)
+# else
+	jmp	L(36bytes)
+# endif
+
+	.p2align 4
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+# else
+	jmp	L(4bytes)
+# endif
+
+	.p2align 4
+L(more40bytes):
+	cmp	$40, %ecx
+	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)
+
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	mov	-44(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	mov	-40(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	mov	-36(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	mov	-32(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	mov	-28(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	mov	-24(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	mov	-20(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# else
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	cmp	-44(%edx), %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	cmp	-40(%edx), %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	cmp	-36(%edx), %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	cmp	-32(%edx), %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	cmp	-28(%edx), %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	cmp	-24(%edx), %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	cmp	-20(%edx), %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	xor	%eax, %eax
+	cmp	-4(%edx), %ecx
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# endif
+
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
+L(45bytes):
+	mov	-45(%eax), %ecx
+	mov	-45(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(41bytes):
+	mov	-41(%eax), %ecx
+	mov	-41(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(37bytes):
+	mov	-37(%eax), %ecx
+	mov	-37(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(33bytes):
+	mov	-33(%eax), %ecx
+	mov	-33(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(29bytes):
+	mov	-29(%eax), %ecx
+	mov	-29(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(25bytes):
+	mov	-25(%eax), %ecx
+	mov	-25(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(21bytes):
+	mov	-21(%eax), %ecx
+	mov	-21(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(46bytes):
+	mov	-46(%eax), %ecx
+	mov	-46(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(42bytes):
+	mov	-42(%eax), %ecx
+	mov	-42(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(38bytes):
+	mov	-38(%eax), %ecx
+	mov	-38(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(34bytes):
+	mov	-34(%eax), %ecx
+	mov	-34(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(30bytes):
+	mov	-30(%eax), %ecx
+	mov	-30(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(26bytes):
+	mov	-26(%eax), %ecx
+	mov	-26(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(22bytes):
+	mov	-22(%eax), %ecx
+	mov	-22(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(47bytes):
+	movl	-47(%eax), %ecx
+	movl	-47(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%eax), %ecx
+	movl	-43(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%eax), %ecx
+	movl	-39(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%eax), %ecx
+	movl	-35(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%eax), %ecx
+	movl	-31(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%eax), %ecx
+	movl	-27(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%eax), %ecx
+	movl	-23(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(find_diff):
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16, %ecx
+	shr	$16, %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+
+	.p2align 4
+L(end):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+# else
+
+/* Tail for wmemcmp: the differing elements compare as signed ints.  */
+	.p2align 4
+L(find_diff):
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+# endif
+END (MEMCMP)
+#endif
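
The Warning comment near the top of this file is the behavioral contract
separating the two functions built from this source: memcmp orders bytes
as unsigned values, while wmemcmp (USE_AS_WMEMCMP) orders 4-byte wchar_t
elements as signed integers, which is why the byte tail branches with ja
and the wmemcmp tail with jg.  A minimal C reference model of that
difference (illustrative only, not the glibc implementation):

    #include <stddef.h>
    #include <wchar.h>

    /* Bytes compare as unsigned char (C11 7.24.4).  */
    static int
    ref_memcmp (const void *s1, const void *s2, size_t n)
    {
      const unsigned char *a = s1, *b = s2;
      for (size_t i = 0; i < n; i++)
        if (a[i] != b[i])
          return a[i] < b[i] ? -1 : 1;
      return 0;
    }

    /* Wide elements compare as wchar_t, a signed 32-bit type on i386,
       so an element with the sign bit set orders below L'\0'.  */
    static int
    ref_wmemcmp (const wchar_t *a, const wchar_t *b, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        if (a[i] != b[i])
          return a[i] < b[i] ? -1 : 1;
      return 0;
    }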
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
new file mode 100644
index 0000000000..1fc5994a17
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
@@ -0,0 +1,62 @@
+/* Multiple versions of memcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)
+2:	ret
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_ia32, @function; \
+	.p2align 4; \
+	.globl __memcmp_ia32; \
+	.hidden __memcmp_ia32; \
+	__memcmp_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC does not work with the hidden functions in a shared library:
+   they would be called without EBX set up for the PLT, which the
+   IFUNC dispatch relies on.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..2fe2072cb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,681 @@
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY	__memcpy_sse2_unaligned
+#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
+# endif
+
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN	RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+	cmp	%edx, %eax
+
+# ifdef USE_AS_MEMMOVE
+	jg	L(check_forward)
+
+L(mm_len_0_or_more_backward):
+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmpl	$32, %ecx
+	jg	L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_backward):
+	cmpl	$64, %ecx
+	jg	L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_backward):
+	cmpl	$128, %ecx
+	jg	L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_128_or_more_backward):
+	add	%ecx, %eax
+	cmp	%edx, %eax
+	movl	SRC(%esp), %eax
+	jle	L(forward)
+	PUSH (%esi)
+	PUSH (%edi)
+	PUSH (%ebx)
+
+/* Align the destination address.  */
+	movdqu	(%eax), %xmm4
+	movdqu	16(%eax), %xmm5
+	movdqu	32(%eax), %xmm6
+	movdqu	48(%eax), %xmm7
+	leal	(%edx, %ecx), %esi
+	movdqu	-16(%eax, %ecx), %xmm0
+	subl	$16, %esp
+	movdqu	%xmm0, (%esp)
+	mov	%ecx, %edi
+	movl	%esi, %ecx
+	andl	$-16, %ecx
+	leal	(%ecx), %ebx
+	subl	%edx, %ebx
+	leal	(%eax, %ebx), %eax
+	shrl	$6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %edi
+# else
+#  ifdef SHARED
+	PUSH (%ebx)
+	SETUP_PIC_REG (bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+	POP (%ebx)
+#  else
+	cmp	__x86_shared_cache_size_half, %edi
+#  endif
+# endif
+	jae	L(mm_large_page_loop_backward)
+
+	.p2align 4
+L(mm_main_loop_backward):
+
+	prefetcht0 -128(%eax)
+
+	movdqu	-64(%eax), %xmm0
+	movdqu	-48(%eax), %xmm1
+	movdqu	-32(%eax), %xmm2
+	movdqu	-16(%eax), %xmm3
+	movaps	%xmm0, -64(%ecx)
+	subl	$64, %eax
+	movaps	%xmm1, -48(%ecx)
+	movaps	%xmm2, -32(%ecx)
+	movaps	%xmm3, -16(%ecx)
+	subl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_main_loop_backward)
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, -16(%esi)
+	movdqu	%xmm4, (%edx)
+	movdqu	%xmm5, 16(%edx)
+	movdqu	%xmm6, 32(%edx)
+	movdqu	%xmm7, 48(%edx)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+/* Copy [0..16] and return.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %cl
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+L(mm_len_3_4_bytes_backward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+L(mm_len_9_16_bytes_backward):
+	PUSH (%esi)
+	movl	-4(%eax,%ecx), %ebx
+	movl	-8(%eax,%ecx), %esi
+	movl	%ebx, -4(%edx,%ecx)
+	movl	%esi, -8(%edx,%ecx)
+	subl	$8, %ecx
+	POP (%esi)
+	jmp	L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+/* Backward copy of big lengths, using non-temporal stores.  */
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%eax), %xmm0
+	movdqu	-48(%eax), %xmm1
+	movdqu	-32(%eax), %xmm2
+	movdqu	-16(%eax), %xmm3
+	movntdq	%xmm0, -64(%ecx)
+	subl	$64, %eax
+	movntdq	%xmm1, -48(%ecx)
+	movntdq	%xmm2, -32(%ecx)
+	movntdq	%xmm3, -16(%ecx)
+	subl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_large_page_loop_backward)
+	sfence
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, -16(%esi)
+	movdqu	%xmm4, (%edx)
+	movdqu	%xmm5, 16(%edx)
+	movdqu	%xmm6, 32(%edx)
+	movdqu	%xmm7, 48(%edx)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+L(check_forward):
+	add	%edx, %ecx
+	cmp	%eax, %ecx
+	movl	LEN(%esp), %ecx
+	jle	L(forward)
+
+/* Now do checks for lengths.  We do [0..16], [16..32], [32..64] and
+   [64..128] separately, as in the backward path above.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmpl	$32, %ecx
+	ja	L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_forward):
+	cmpl	$64, %ecx
+	ja	L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_forward):
+	cmpl	$128, %ecx
+	ja	L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_128_or_more_forward):
+	PUSH (%esi)
+	PUSH (%edi)
+	PUSH (%ebx)
+
+/* Align the destination address.  */
+	movdqu	-16(%eax, %ecx), %xmm4
+	movdqu	-32(%eax, %ecx), %xmm5
+	movdqu	-48(%eax, %ecx), %xmm6
+	movdqu	-64(%eax, %ecx), %xmm7
+	leal	(%edx, %ecx), %esi
+	movdqu	(%eax), %xmm0
+	subl	$16, %esp
+	movdqu	%xmm0, (%esp)
+	mov	%ecx, %edi
+	leal	16(%edx), %ecx
+	andl	$-16, %ecx
+	movl	%ecx, %ebx
+	subl	%edx, %ebx
+	addl	%ebx, %eax
+	movl	%esi, %ebx
+	subl	%ecx, %ebx
+	shrl	$6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %edi
+# else
+#  ifdef SHARED
+	PUSH (%ebx)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+	POP (%ebx)
+#  else
+	cmp	__x86_shared_cache_size_half, %edi
+#  endif
+# endif
+	jae	L(mm_large_page_loop_forward)
+
+	.p2align 4
+L(mm_main_loop_forward):
+
+	prefetcht0 128(%eax)
+
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqa	%xmm0, (%ecx)
+	addl	$64, %eax
+	movaps	%xmm1, 16(%ecx)
+	movaps	%xmm2, 32(%ecx)
+	movaps	%xmm3, 48(%ecx)
+	addl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_main_loop_forward)
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm4, -16(%esi)
+	movdqu	%xmm5, -32(%esi)
+	movdqu	%xmm6, -48(%esi)
+	movdqu	%xmm7, -64(%esi)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_len_0_16_bytes_forward):
+	testb	$24, %cl
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+L(mm_len_2_4_bytes_forward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+L(mm_len_5_8_bytes_forward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+L(mm_len_9_16_bytes_forward):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_return_pop_all):
+	movl	%edx, %eax
+	POP (%edi)
+	POP (%esi)
+	RETURN
+
+/* Forward copy of big lengths, using non-temporal stores.  */
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movntdq	%xmm0, (%ecx)
+	addl	$64, %eax
+	movntdq	%xmm1, 16(%ecx)
+	movntdq	%xmm2, 32(%ecx)
+	movntdq	%xmm3, 48(%ecx)
+	addl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_large_page_loop_forward)
+	sfence
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm4, -16(%esi)
+	movdqu	%xmm5, -32(%esi)
+	movdqu	%xmm6, -48(%esi)
+	movdqu	%xmm7, -64(%esi)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+# endif
+
+L(forward):
+	cmp	$16, %ecx
+	jbe	L(len_0_16_bytes)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+	jae     L(large_page)
+
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	cmpl    $32, %ecx
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	16(%eax), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	cmpl    $64, %ecx
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	32(%eax), %xmm0
+	movdqu	48(%eax), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+	cmpl    $128, %ecx
+	movdqu	%xmm0, 32(%edx)
+	movdqu	%xmm1, 48(%edx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	jbe	L(return)
+
+/* Now the main loop: we align the address of the destination.  */
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+	addl	%edx, %ecx
+	andl	$-64, %ecx
+
+	subl	%edx, %eax
+
+/* Stop two iterations early so the prefetcht0 in the loop, which
+   looks 128 bytes ahead, never reads past the copied region.  */
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_just_one_iteration)
+
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_last_two_iterations)
+
+	.p2align 4
+L(main_loop_cache):
+
+	prefetcht0 128(%ebx, %eax)
+
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	lea	64(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	movaps	%xmm4, 64(%ebx)
+	movaps	%xmm5, 80(%ebx)
+	movaps	%xmm6, 96(%ebx)
+	movaps	%xmm7, 112(%ebx)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	jmp	L(return)
+
+L(large_page):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+
+	movdqu	64(%eax), %xmm0
+	movdqu	80(%eax), %xmm1
+	movdqu	96(%eax), %xmm2
+	movdqu	112(%eax), %xmm3
+	movdqu	-128(%eax, %ecx), %xmm4
+	movdqu	-112(%eax, %ecx), %xmm5
+	movdqu	-96(%eax, %ecx), %xmm6
+	movdqu	-80(%eax, %ecx), %xmm7
+	movdqu	%xmm0, 64(%edx)
+	movdqu	%xmm1, 80(%edx)
+	movdqu	%xmm2, 96(%edx)
+	movdqu	%xmm3, 112(%edx)
+	movdqu	%xmm4, -128(%edx, %ecx)
+	movdqu	%xmm5, -112(%edx, %ecx)
+	movdqu	%xmm6, -96(%edx, %ecx)
+	movdqu	%xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non-temporal stores.  We align
+	the address of the destination.  */
+	leal	128(%edx), %ebx
+	andl	$-128, %ebx
+
+	addl	%edx, %ecx
+	andl	$-128, %ecx
+
+	subl	%edx, %eax
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movntdq	%xmm0, (%ebx)
+	movntdq	%xmm1, 16(%ebx)
+	movntdq	%xmm2, 32(%ebx)
+	movntdq	%xmm3, 48(%ebx)
+	movntdq	%xmm4, 64(%ebx)
+	movntdq	%xmm5, 80(%ebx)
+	movntdq	%xmm6, 96(%ebx)
+	movntdq	%xmm7, 112(%ebx)
+	lea	128(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_large_page)
+	sfence
+	jmp	L(return)
+
+L(len_0_16_bytes):
+	testb	$24, %cl
+	jne	L(len_9_16_bytes)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%eax), %ebx
+	testb	$2, %cl
+	movb	%bl, (%edx)
+	je	L(return)
+	movzwl	-2(%eax,%ecx), %ebx
+	movw	%bx, -2(%edx,%ecx)
+	jmp	L(return)
+
+L(len_9_16_bytes):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(len_5_8_bytes):
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+	movl	-4(%eax,%ecx), %ebx
+	movl	%ebx, -4(%edx,%ecx)
+
+L(return):
+	movl	%edx, %eax
+# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+	RETURN
+
+END (MEMCPY)
+#endif
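
When built as memmove (USE_AS_MEMMOVE), the code above chooses a copy
direction from the overlap test at entry: it copies forward unless the
destination starts inside the source range, in which case it copies
backward.  On top of that, copies of at least half the shared cache size
switch from movdqa/movaps to non-temporal movntdq stores, closed by an
sfence, so that one huge copy does not evict the whole cache.  A
simplified C model of the direction decision only (the unrolled
16/32/64/128-byte tiers and the cache-size cut-over are omitted):

    #include <stddef.h>
    #include <stdint.h>

    static void *
    ref_memmove (void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;

      /* Forward is safe whenever dst is not inside [src, src + n);
         the unsigned subtraction wraps around for dst < src.  */
      if ((uintptr_t) d - (uintptr_t) s >= n)
        for (size_t i = 0; i < n; i++)      /* L(forward)-style copy */
          d[i] = s[i];
      else
        for (size_t i = n; i-- > 0; )       /* backward copy */
          d[i] = s[i];
      return dst;
    }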
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
new file mode 100644
index 0000000000..687e083147
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -0,0 +1,1809 @@
+/* memcpy with SSSE3 and REP string.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_rep
+# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing
+   the index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+#else
+# define PARMS		4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into
+   the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
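+
+/* In the SHARED case the tables reached through these macros store
+   each entry as "target - table", so the tables need no load-time
+   relocations; the macro adds the table address back before jumping.
+   GNU C can express the same pattern with computed goto (the labels
+   here are illustrative only):
+
+	static const int table[] =
+	  { &&case0 - &&case0, &&case1 - &&case0, &&case2 - &&case0 };
+	goto *(&&case0 + table[index]);
+
+   The non-SHARED macros jump through absolute addresses and skip the
+   extra add.  */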
+
+	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
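+
+/* __memcpy_chk (dst, src, len, dstlen): with only the return address
+   on the stack, len sits at 12(%esp) and dstlen at 16(%esp).  The code
+   above diagnoses dstlen < len via __chk_fail and otherwise falls
+   straight through into MEMCPY.  */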
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$48, %ecx
+	jb	L(bk_write_less48bytes)
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jb	L(bk_write)
+#endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* ECX >= 48; EDX gets aligned to 16 bytes below.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	cfi_remember_state
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
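+
+	/* Head alignment: %edx was rounded down to 16 bytes and bumped
+	   by 16, so it now points at the first 16-byte boundary past the
+	   original destination (saved in %esi).  %edi = dst - %edx is in
+	   [-16, -1]; adding it to %ecx and subtracting it from %eax
+	   discounts the head bytes, which the unaligned store of %xmm0
+	   covers later.  */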
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_shared_cache_size_half, %ecx
+# endif
+#endif
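+
+	/* Copies of at least half the shared cache size take the
+	   non-temporal L(large_page) path below; smaller ones are
+	   dispatched on the low four bits of the (adjusted) source
+	   address.  */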
+
+	mov	%eax, %edi
+	jae	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+	mov	__x86_data_cache_size_half, %edi
+# endif
+#endif
+	mov	%edi, %esi
+	shr	$3, %esi
+	sub	%esi, %edi
+	cmp	%edi, %ecx
+	jae	L(shl_0_gobble_mem_start)
+	sub	$128, %ecx
+	ALIGN (4)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_0_gobble_mem_start):
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+	sub	$128, %ecx
+L(shl_0_gobble_mem_loop):
+	prefetchnta 0x1c0(%eax)
+	prefetchnta 0x280(%eax)
+	prefetchnta 0x1c0(%edx)
+	prefetchnta 0x280(%edx)
+
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
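+/* Each L(shl_N) block handles a source that is N bytes past a 16-byte
+   boundary once the destination has been aligned: %eax is backed up by
+   N so every load is aligned, and palignr splices neighbouring 16-byte
+   blocks back into the N-shifted byte stream before it is written with
+   aligned stores.  */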
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$1, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_1_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_1_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_1_loop)
+
+L(shl_1_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$2, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_2_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_2_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_2_loop)
+
+L(shl_2_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$3, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_3_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_3_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_3_loop)
+
+L(shl_3_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$4, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_4_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_4_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_4_loop)
+
+L(shl_4_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_5):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$5, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_5_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_5_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_5_loop)
+
+L(shl_5_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_6):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$6, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_6_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_6_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_6_loop)
+
+L(shl_6_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_7):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$7, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_7_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_7_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_7_loop)
+
+L(shl_7_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_8):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$8, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_8_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_8_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_8_loop)
+
+L(shl_8_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	8(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_9):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$9, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_9_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_9_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_9_loop)
+
+L(shl_9_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	9(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_10):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$10, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_10_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_10_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_10_loop)
+
+L(shl_10_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	10(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_11):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$11, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_11_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_11_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_11_loop)
+
+L(shl_11_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	11(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_12):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$12, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_12_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_12_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_12_loop)
+
+L(shl_12_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	12(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_13):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$13, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_13_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_13_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_13_loop)
+
+L(shl_13_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	13(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_14):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$14, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_14_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_14_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_14_loop)
+
+L(shl_14_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	14(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_15):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$15, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_15_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_15_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_15_loop)
+
+L(shl_15_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	15(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(fwd_write_44bytes):
+	movl	-44(%eax), %ecx
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes):
+	movl	-45(%eax), %ecx
+	movl	%ecx, -45(%edx)
+L(fwd_write_41bytes):
+	movl	-41(%eax), %ecx
+	movl	%ecx, -41(%edx)
+L(fwd_write_37bytes):
+	movl	-37(%eax), %ecx
+	movl	%ecx, -37(%edx)
+L(fwd_write_33bytes):
+	movl	-33(%eax), %ecx
+	movl	%ecx, -33(%edx)
+L(fwd_write_29bytes):
+	movl	-29(%eax), %ecx
+	movl	%ecx, -29(%edx)
+L(fwd_write_25bytes):
+	movl	-25(%eax), %ecx
+	movl	%ecx, -25(%edx)
+L(fwd_write_21bytes):
+	movl	-21(%eax), %ecx
+	movl	%ecx, -21(%edx)
+L(fwd_write_17bytes):
+	movl	-17(%eax), %ecx
+	movl	%ecx, -17(%edx)
+L(fwd_write_13bytes):
+	movl	-13(%eax), %ecx
+	movl	%ecx, -13(%edx)
+L(fwd_write_9bytes):
+	movl	-9(%eax), %ecx
+	movl	%ecx, -9(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes):
+	movl	-46(%eax), %ecx
+	movl	%ecx, -46(%edx)
+L(fwd_write_42bytes):
+	movl	-42(%eax), %ecx
+	movl	%ecx, -42(%edx)
+L(fwd_write_38bytes):
+	movl	-38(%eax), %ecx
+	movl	%ecx, -38(%edx)
+L(fwd_write_34bytes):
+	movl	-34(%eax), %ecx
+	movl	%ecx, -34(%edx)
+L(fwd_write_30bytes):
+	movl	-30(%eax), %ecx
+	movl	%ecx, -30(%edx)
+L(fwd_write_26bytes):
+	movl	-26(%eax), %ecx
+	movl	%ecx, -26(%edx)
+L(fwd_write_22bytes):
+	movl	-22(%eax), %ecx
+	movl	%ecx, -22(%edx)
+L(fwd_write_18bytes):
+	movl	-18(%eax), %ecx
+	movl	%ecx, -18(%edx)
+L(fwd_write_14bytes):
+	movl	-14(%eax), %ecx
+	movl	%ecx, -14(%edx)
+L(fwd_write_10bytes):
+	movl	-10(%eax), %ecx
+	movl	%ecx, -10(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes):
+	movl	-47(%eax), %ecx
+	movl	%ecx, -47(%edx)
+L(fwd_write_43bytes):
+	movl	-43(%eax), %ecx
+	movl	%ecx, -43(%edx)
+L(fwd_write_39bytes):
+	movl	-39(%eax), %ecx
+	movl	%ecx, -39(%edx)
+L(fwd_write_35bytes):
+	movl	-35(%eax), %ecx
+	movl	%ecx, -35(%edx)
+L(fwd_write_31bytes):
+	movl	-31(%eax), %ecx
+	movl	%ecx, -31(%edx)
+L(fwd_write_27bytes):
+	movl	-27(%eax), %ecx
+	movl	%ecx, -27(%edx)
+L(fwd_write_23bytes):
+	movl	-23(%eax), %ecx
+	movl	%ecx, -23(%edx)
+L(fwd_write_19bytes):
+	movl	-19(%eax), %ecx
+	movl	%ecx, -19(%edx)
+L(fwd_write_15bytes):
+	movl	-15(%eax), %ecx
+	movl	%ecx, -15(%edx)
+L(fwd_write_11bytes):
+	movl	-11(%eax), %ecx
+	movl	%ecx, -11(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN_END
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
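+/* Very large copies stream with movntdq (non-temporal stores) so the
+   destination data does not evict the working set from the cache; the
+   sfence at the end of this path orders the weakly-ordered stores
+   before returning.  */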
+L(large_page):
+	movdqu	(%eax), %xmm1
+	movdqu	%xmm0, (%esi)
+	movntdq	%xmm1, (%edx)
+	add	$0x10, %eax
+	add	$0x10, %edx
+	sub	$0x10, %ecx
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+L(large_page_loop_init):
+	POP (%esi)
+	sub	$0x80, %ecx
+	POP (%edi)
+L(large_page_loop):
+	prefetchnta	0x1c0(%eax)
+	prefetchnta	0x280(%eax)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	lfence
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
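+/* Taken when source and destination have identical low address bytes,
+   i.e. identical alignment: the count splits as
+   len = 4 * (len >> 2) + (len & 3); rep movsl moves the dword part
+   (leaving flags untouched, so the jz still tests the and), and the
+   word/byte tails mop up the remainder.  */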
+L(copy_page_by_rep):
+	mov	%eax, %esi
+	mov	%edx, %edi
+	mov	%ecx, %edx
+	shr	$2, %ecx
+	and	$3, %edx
+	rep	movsl
+	jz	L(copy_page_by_rep_exit)
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)
+	movzwl	(%esi), %eax
+	movw	%ax, (%edi)
+	add	$2, %esi
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+	movzbl	(%esi), %eax
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%esi)
+	POP (%edi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_44bytes):
+	movl	40(%eax), %ecx
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_45bytes):
+	movl	41(%eax), %ecx
+	movl	%ecx, 41(%edx)
+L(bk_write_41bytes):
+	movl	37(%eax), %ecx
+	movl	%ecx, 37(%edx)
+L(bk_write_37bytes):
+	movl	33(%eax), %ecx
+	movl	%ecx, 33(%edx)
+L(bk_write_33bytes):
+	movl	29(%eax), %ecx
+	movl	%ecx, 29(%edx)
+L(bk_write_29bytes):
+	movl	25(%eax), %ecx
+	movl	%ecx, 25(%edx)
+L(bk_write_25bytes):
+	movl	21(%eax), %ecx
+	movl	%ecx, 21(%edx)
+L(bk_write_21bytes):
+	movl	17(%eax), %ecx
+	movl	%ecx, 17(%edx)
+L(bk_write_17bytes):
+	movl	13(%eax), %ecx
+	movl	%ecx, 13(%edx)
+L(bk_write_13bytes):
+	movl	9(%eax), %ecx
+	movl	%ecx, 9(%edx)
+L(bk_write_9bytes):
+	movl	5(%eax), %ecx
+	movl	%ecx, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_46bytes):
+	movl	42(%eax), %ecx
+	movl	%ecx, 42(%edx)
+L(bk_write_42bytes):
+	movl	38(%eax), %ecx
+	movl	%ecx, 38(%edx)
+L(bk_write_38bytes):
+	movl	34(%eax), %ecx
+	movl	%ecx, 34(%edx)
+L(bk_write_34bytes):
+	movl	30(%eax), %ecx
+	movl	%ecx, 30(%edx)
+L(bk_write_30bytes):
+	movl	26(%eax), %ecx
+	movl	%ecx, 26(%edx)
+L(bk_write_26bytes):
+	movl	22(%eax), %ecx
+	movl	%ecx, 22(%edx)
+L(bk_write_22bytes):
+	movl	18(%eax), %ecx
+	movl	%ecx, 18(%edx)
+L(bk_write_18bytes):
+	movl	14(%eax), %ecx
+	movl	%ecx, 14(%edx)
+L(bk_write_14bytes):
+	movl	10(%eax), %ecx
+	movl	%ecx, 10(%edx)
+L(bk_write_10bytes):
+	movl	6(%eax), %ecx
+	movl	%ecx, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_47bytes):
+	movl	43(%eax), %ecx
+	movl	%ecx, 43(%edx)
+L(bk_write_43bytes):
+	movl	39(%eax), %ecx
+	movl	%ecx, 39(%edx)
+L(bk_write_39bytes):
+	movl	35(%eax), %ecx
+	movl	%ecx, 35(%edx)
+L(bk_write_35bytes):
+	movl	31(%eax), %ecx
+	movl	%ecx, 31(%edx)
+L(bk_write_31bytes):
+	movl	27(%eax), %ecx
+	movl	%ecx, 27(%edx)
+L(bk_write_27bytes):
+	movl	23(%eax), %ecx
+	movl	%ecx, 23(%edx)
+L(bk_write_23bytes):
+	movl	19(%eax), %ecx
+	movl	%ecx, 19(%edx)
+L(bk_write_19bytes):
+	movl	15(%eax), %ecx
+	movl	%ecx, 15(%edx)
+L(bk_write_15bytes):
+	movl	11(%eax), %ecx
+	movl	%ecx, 11(%edx)
+L(bk_write_11bytes):
+	movl	7(%eax), %ecx
+	movl	%ecx, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):
+	PUSH (%esi)
+	movl	%eax, %esi
+	add	%ecx, %edx
+	add	%ecx, %esi
+	testl	$0x3, %edx
+	jnz	L(bk_align)
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movl	-4(%esi), %eax
+	movl	%eax, -4(%edx)
+	movl	-8(%esi), %eax
+	movl	%eax, -8(%edx)
+	movl	-12(%esi), %eax
+	movl	%eax, -12(%edx)
+	movl	-16(%esi), %eax
+	movl	%eax, -16(%edx)
+	movl	-20(%esi), %eax
+	movl	%eax, -20(%edx)
+	movl	-24(%esi), %eax
+	movl	%eax, -24(%edx)
+	movl	-28(%esi), %eax
+	movl	%eax, -28(%edx)
+	movl	-32(%esi), %eax
+	movl	%eax, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %esi
+
+L(bk_write_less32bytes):
+	movl	%esi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%esi)
+L(bk_write_less48bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	CFI_PUSH (%esi)
+	ALIGN (4)
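+/* Align the end of the destination down to 4 bytes: e.g. if %edx ends
+   in binary ...11, one byte is moved first and then one word, leaving
+   %edx 4-byte aligned for L(bk_aligned_4).  */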
+L(bk_align):
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)
+	testl	$1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %esi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%esi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %esi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%esi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	ALIGN (4)
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is 4-byte aligned, but not 16-byte aligned.  */
+L(bk_ssse3_align):
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)
+
+L(bk_ssse3_cpy):
+	sub	$64, %esi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%esi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%esi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%esi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%esi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000000..53e8a6ca1d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3162 @@
+/* memcpy with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY		__memcpy_ssse3
+#  define MEMCPY_CHK	__memcpy_chk_ssse3
+# endif
+
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+#  define PARMS		8		/* Preserve EBX.  */
+#  define ENTRANCE	PUSH (%ebx);
+#  define RETURN_END	POP (%ebx); ret
+#  define RETURN		RETURN_END; CFI_PUSH (%ebx)
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */		\
+	SETUP_PIC_REG(bx);		\
+    /* Get the address of the jump table.  */		\
+	addl	$(TABLE - .), %ebx;		\
+    /* Get the entry and convert the relative offset to the		\
+       absolute address.  */		\
+	addl	(%ebx, INDEX, SCALE), %ebx;		\
+    /* We loaded the jump table.  Go.  */		\
+	jmp	*%ebx
+# else
+
+#  define PARMS		4
+#  define ENTRANCE
+#  define RETURN_END	ret
+#  define RETURN		RETURN_END
+#  define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(, INDEX, SCALE)
+# endif
+
+	.section .text.ssse3,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+# ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$32, %ecx
+	jae	L(memmove_bwd)
+	jmp	L(bk_write_less32bytes_2)
+
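+/* dst > src here, so a backward copy is needed only when the regions
+   overlap, i.e. when dst < src + len; otherwise control falls through
+   to the ordinary forward path.  */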
+	.p2align 4
+L(memmove_bwd):
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+# endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+# ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jb	L(bk_write)
+# endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+# ifndef USE_AS_MEMMOVE
+	.p2align 4
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+# endif
+
+	.p2align 4
+L(48bytesormore):
+# ifndef USE_AS_MEMMOVE
+	movlpd	(%eax), %xmm0
+	movlpd	8(%eax), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+# else
+	movdqu	(%eax), %xmm0
+# endif
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+
+	mov	%eax, %edi
+	jae	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	.p2align 4
+L(shl_0):
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
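+	/* DEST+4 rather than DEST: the PUSH (%edi) above moved %esp
+	   down by 4 relative to the offsets PARMS assumes.  */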
+# endif
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+
+	.p2align 4
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_0_gobble):
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	POP	(%edi)
+	lea	-128(%ecx), %ecx
+	jae	L(shl_0_gobble_mem_loop)
+
+	.p2align 4
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
+L(shl_0_gobble_mem_loop):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x280(%eax)
+	prefetcht0 0x1c0(%edx)
+
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+	.p2align 4
+L(shl_1):
+# ifndef USE_AS_MEMMOVE
+	movaps	-1(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-1(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb	L(sh_1_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
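+/* In-cache variant: prefetcht0 pulls both streams in about 448 bytes
+   (0x1c0) ahead while each iteration splices and copies 64 bytes with
+   palignr.  */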
+L(Shl1LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	movaps	47(%eax), %xmm4
+	movaps	63(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_1_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-1(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_1_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_1_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_1_no_prefetch_loop)
+
+L(sh_1_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_2):
+# ifndef USE_AS_MEMMOVE
+	movaps	-2(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-2(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb	L(sh_2_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl2LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	movaps	46(%eax), %xmm4
+	movaps	62(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_2_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-2(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_2_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_2_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_2_no_prefetch_loop)
+
+L(sh_2_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_3):
+# ifndef USE_AS_MEMMOVE
+	movaps	-3(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-3(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb	L(sh_3_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl3LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	movaps	45(%eax), %xmm4
+	movaps	61(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_3_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-3(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_3_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_3_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_3_no_prefetch_loop)
+
+L(sh_3_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_4):
+# ifndef USE_AS_MEMMOVE
+	movaps	-4(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-4(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb	L(sh_4_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl4LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	movaps	44(%eax), %xmm4
+	movaps	60(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_4_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-4(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_4_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_4_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_4_no_prefetch_loop)
+
+L(sh_4_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_5):
+# ifndef USE_AS_MEMMOVE
+	movaps	-5(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-5(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_5_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl5LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	movaps	43(%eax), %xmm4
+	movaps	59(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	palignr	$5, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_5_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-5(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_5_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_5_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_5_no_prefetch_loop)
+
+L(sh_5_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_6):
+# ifndef USE_AS_MEMMOVE
+	movaps	-6(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-6(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_6_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl6LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	movaps	42(%eax), %xmm4
+	movaps	58(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	palignr	$6, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_6_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-6(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_6_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_6_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_6_no_prefetch_loop)
+
+L(sh_6_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_7):
+# ifndef USE_AS_MEMMOVE
+	movaps	-7(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-7(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_7_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl7LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	movaps	41(%eax), %xmm4
+	movaps	57(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	palignr	$7, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl7LoopStart)
+
+L(Shl7LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_7_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-7(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_7_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_7_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_7_no_prefetch_loop)
+
+L(sh_7_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_8):
+# ifndef USE_AS_MEMMOVE
+	movaps	-8(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-8(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_8_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl8LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	movaps	40(%eax), %xmm4
+	movaps	56(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	palignr	$8, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl8LoopStart)
+
+L(Shl8LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_8_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-8(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_8_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_8_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_8_no_prefetch_loop)
+
+L(sh_8_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	8(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_9):
+# ifndef USE_AS_MEMMOVE
+	movaps	-9(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-9(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_9_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl9LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	movaps	39(%eax), %xmm4
+	movaps	55(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	palignr	$9, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl9LoopStart)
+
+L(Shl9LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_9_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-9(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_9_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_9_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_9_no_prefetch_loop)
+
+L(sh_9_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	9(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_10):
+# ifndef USE_AS_MEMMOVE
+	movaps	-10(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-10(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_10_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl10LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	movaps	38(%eax), %xmm4
+	movaps	54(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	palignr	$10, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl10LoopStart)
+
+L(Shl10LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_10_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-10(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_10_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_10_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_10_no_prefetch_loop)
+
+L(sh_10_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	10(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_11):
+# ifndef USE_AS_MEMMOVE
+	movaps	-11(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-11(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_11_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl11LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	movaps	37(%eax), %xmm4
+	movaps	53(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	palignr	$11, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl11LoopStart)
+
+L(Shl11LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_11_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-11(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_11_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_11_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_11_no_prefetch_loop)
+
+L(sh_11_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	11(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_12):
+# ifndef USE_AS_MEMMOVE
+	movaps	-12(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-12(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_12_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl12LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	movaps	36(%eax), %xmm4
+	movaps	52(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	palignr	$12, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl12LoopStart)
+
+L(Shl12LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_12_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-12(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_12_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_12_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_12_no_prefetch_loop)
+
+L(sh_12_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	12(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_13):
+# ifndef USE_AS_MEMMOVE
+	movaps	-13(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-13(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_13_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl13LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	movaps	35(%eax), %xmm4
+	movaps	51(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	palignr	$13, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl13LoopStart)
+
+L(Shl13LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_13_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-13(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_13_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_13_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_13_no_prefetch_loop)
+
+L(sh_13_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	13(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_14):
+# ifndef USE_AS_MEMMOVE
+	movaps	-14(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-14(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_14_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl14LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	movaps	34(%eax), %xmm4
+	movaps	50(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	palignr	$14, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl14LoopStart)
+
+L(Shl14LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_14_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-14(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_14_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_14_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_14_no_prefetch_loop)
+
+L(sh_14_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	14(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_15):
+# ifndef USE_AS_MEMMOVE
+	movaps	-15(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-15(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_15_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl15LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	movaps	33(%eax), %xmm4
+	movaps	49(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	palignr	$15, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl15LoopStart)
+
+L(Shl15LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_15_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-15(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_15_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_15_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_15_no_prefetch_loop)
+
+L(sh_15_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	15(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_end_0):
+	lea	32(%ecx), %ecx
+	lea	(%edx, %ecx), %edx
+	lea	(%eax, %ecx), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
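+/* Forward tail copies: each L(fwd_write_Nbytes) label falls through
+   to the next smaller size, storing the final N bytes with 8-, 4-,
+   2- and 1-byte moves addressed backward from the end of the region
+   (%eax and %edx point one past it).  */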
+	.p2align 4
+L(fwd_write_44bytes):
+	movq	-44(%eax), %xmm0
+	movq	%xmm0, -44(%edx)
+L(fwd_write_36bytes):
+	movq	-36(%eax), %xmm0
+	movq	%xmm0, -36(%edx)
+L(fwd_write_28bytes):
+	movq	-28(%eax), %xmm0
+	movq	%xmm0, -28(%edx)
+L(fwd_write_20bytes):
+	movq	-20(%eax), %xmm0
+	movq	%xmm0, -20(%edx)
+L(fwd_write_12bytes):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_40bytes):
+	movq	-40(%eax), %xmm0
+	movq	%xmm0, -40(%edx)
+L(fwd_write_32bytes):
+	movq	-32(%eax), %xmm0
+	movq	%xmm0, -32(%edx)
+L(fwd_write_24bytes):
+	movq	-24(%eax), %xmm0
+	movq	%xmm0, -24(%edx)
+L(fwd_write_16bytes):
+	movq	-16(%eax), %xmm0
+	movq	%xmm0, -16(%edx)
+L(fwd_write_8bytes):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes):
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_5bytes):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_45bytes):
+	movq	-45(%eax), %xmm0
+	movq	%xmm0, -45(%edx)
+L(fwd_write_37bytes):
+	movq	-37(%eax), %xmm0
+	movq	%xmm0, -37(%edx)
+L(fwd_write_29bytes):
+	movq	-29(%eax), %xmm0
+	movq	%xmm0, -29(%edx)
+L(fwd_write_21bytes):
+	movq	-21(%eax), %xmm0
+	movq	%xmm0, -21(%edx)
+L(fwd_write_13bytes):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_41bytes):
+	movq	-41(%eax), %xmm0
+	movq	%xmm0, -41(%edx)
+L(fwd_write_33bytes):
+	movq	-33(%eax), %xmm0
+	movq	%xmm0, -33(%edx)
+L(fwd_write_25bytes):
+	movq	-25(%eax), %xmm0
+	movq	%xmm0, -25(%edx)
+L(fwd_write_17bytes):
+	movq	-17(%eax), %xmm0
+	movq	%xmm0, -17(%edx)
+L(fwd_write_9bytes):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_46bytes):
+	movq	-46(%eax), %xmm0
+	movq	%xmm0, -46(%edx)
+L(fwd_write_38bytes):
+	movq	-38(%eax), %xmm0
+	movq	%xmm0, -38(%edx)
+L(fwd_write_30bytes):
+	movq	-30(%eax), %xmm0
+	movq	%xmm0, -30(%edx)
+L(fwd_write_22bytes):
+	movq	-22(%eax), %xmm0
+	movq	%xmm0, -22(%edx)
+L(fwd_write_14bytes):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_42bytes):
+	movq	-42(%eax), %xmm0
+	movq	%xmm0, -42(%edx)
+L(fwd_write_34bytes):
+	movq	-34(%eax), %xmm0
+	movq	%xmm0, -34(%edx)
+L(fwd_write_26bytes):
+	movq	-26(%eax), %xmm0
+	movq	%xmm0, -26(%edx)
+L(fwd_write_18bytes):
+	movq	-18(%eax), %xmm0
+	movq	%xmm0, -18(%edx)
+L(fwd_write_10bytes):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_47bytes):
+	movq	-47(%eax), %xmm0
+	movq	%xmm0, -47(%edx)
+L(fwd_write_39bytes):
+	movq	-39(%eax), %xmm0
+	movq	%xmm0, -39(%edx)
+L(fwd_write_31bytes):
+	movq	-31(%eax), %xmm0
+	movq	%xmm0, -31(%edx)
+L(fwd_write_23bytes):
+	movq	-23(%eax), %xmm0
+	movq	%xmm0, -23(%edx)
+L(fwd_write_15bytes):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_43bytes):
+	movq	-43(%eax), %xmm0
+	movq	%xmm0, -43(%edx)
+L(fwd_write_35bytes):
+	movq	-35(%eax), %xmm0
+	movq	%xmm0, -35(%edx)
+L(fwd_write_27bytes):
+	movq	-27(%eax), %xmm0
+	movq	%xmm0, -27(%edx)
+L(fwd_write_19bytes):
+	movq	-19(%eax), %xmm0
+	movq	%xmm0, -19(%edx)
+L(fwd_write_11bytes):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_40bytes_align):
+	movdqa	-40(%eax), %xmm0
+	movdqa	%xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+	movdqa	-24(%eax), %xmm0
+	movdqa	%xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes_align):
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_32bytes_align):
+	movdqa	-32(%eax), %xmm0
+	movdqa	%xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+	movdqa	-16(%eax), %xmm0
+	movdqa	%xmm0, -16(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_5bytes_align):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_45bytes_align):
+	movdqa	-45(%eax), %xmm0
+	movdqa	%xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+	movdqa	-29(%eax), %xmm0
+	movdqa	%xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_37bytes_align):
+	movdqa	-37(%eax), %xmm0
+	movdqa	%xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+	movdqa	-21(%eax), %xmm0
+	movdqa	%xmm0, -21(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_41bytes_align):
+	movdqa	-41(%eax), %xmm0
+	movdqa	%xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+	movdqa	-25(%eax), %xmm0
+	movdqa	%xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_33bytes_align):
+	movdqa	-33(%eax), %xmm0
+	movdqa	%xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+	movdqa	-17(%eax), %xmm0
+	movdqa	%xmm0, -17(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_46bytes_align):
+	movdqa	-46(%eax), %xmm0
+	movdqa	%xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+	movdqa	-30(%eax), %xmm0
+	movdqa	%xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_38bytes_align):
+	movdqa	-38(%eax), %xmm0
+	movdqa	%xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+	movdqa	-22(%eax), %xmm0
+	movdqa	%xmm0, -22(%edx)
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_42bytes_align):
+	movdqa	-42(%eax), %xmm0
+	movdqa	%xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+	movdqa	-26(%eax), %xmm0
+	movdqa	%xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_34bytes_align):
+	movdqa	-34(%eax), %xmm0
+	movdqa	%xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+	movdqa	-18(%eax), %xmm0
+	movdqa	%xmm0, -18(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_47bytes_align):
+	movdqa	-47(%eax), %xmm0
+	movdqa	%xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+	movdqa	-31(%eax), %xmm0
+	movdqa	%xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_39bytes_align):
+	movdqa	-39(%eax), %xmm0
+	movdqa	%xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+	movdqa	-23(%eax), %xmm0
+	movdqa	%xmm0, -23(%edx)
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_43bytes_align):
+	movdqa	-43(%eax), %xmm0
+	movdqa	%xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+	movdqa	-27(%eax), %xmm0
+	movdqa	%xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_35bytes_align):
+	movdqa	-35(%eax), %xmm0
+	movdqa	%xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+	movdqa	-19(%eax), %xmm0
+	movdqa	%xmm0, -19(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_44bytes_align):
+	movdqa	-44(%eax), %xmm0
+	movdqa	%xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+	movdqa	-28(%eax), %xmm0
+	movdqa	%xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_36bytes_align):
+	movdqa	-36(%eax), %xmm0
+	movdqa	%xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+	movdqa	-20(%eax), %xmm0
+	movdqa	%xmm0, -20(%edx)
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN_END
+
+	CFI_PUSH (%edi)
+
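+/* L(large_page): presumably taken when the copy is too large to be
+   cache friendly.  It streams data with movntdq non-temporal stores,
+   which bypass the cache; the sfence before the final jump-table
+   dispatch orders those stores before the routine returns.  */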
+	.p2align 4
+L(large_page):
+	movdqu	(%eax), %xmm1
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+# endif
+	lea	16(%eax), %eax
+	movntdq	%xmm1, (%edx)
+	lea	16(%edx), %edx
+	lea	-0x90(%ecx), %ecx
+	POP (%edi)
+
+	.p2align 4
+L(large_page_loop):
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
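+/* Backward tail copies, mirroring the forward ladder above: here
+   %eax and %edx point at the start of the region, the highest bytes
+   are stored first, and each label falls through to the next smaller
+   size.  */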
+	.p2align 4
+L(bk_write_44bytes):
+	movq	36(%eax), %xmm0
+	movq	%xmm0, 36(%edx)
+L(bk_write_36bytes):
+	movq	28(%eax), %xmm0
+	movq	%xmm0, 28(%edx)
+L(bk_write_28bytes):
+	movq	20(%eax), %xmm0
+	movq	%xmm0, 20(%edx)
+L(bk_write_20bytes):
+	movq	12(%eax), %xmm0
+	movq	%xmm0, 12(%edx)
+L(bk_write_12bytes):
+	movq	4(%eax), %xmm0
+	movq	%xmm0, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_40bytes):
+	movq	32(%eax), %xmm0
+	movq	%xmm0, 32(%edx)
+L(bk_write_32bytes):
+	movq	24(%eax), %xmm0
+	movq	%xmm0, 24(%edx)
+L(bk_write_24bytes):
+	movq	16(%eax), %xmm0
+	movq	%xmm0, 16(%edx)
+L(bk_write_16bytes):
+	movq	8(%eax), %xmm0
+	movq	%xmm0, 8(%edx)
+L(bk_write_8bytes):
+	movq	(%eax), %xmm0
+	movq	%xmm0, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_45bytes):
+	movq	37(%eax), %xmm0
+	movq	%xmm0, 37(%edx)
+L(bk_write_37bytes):
+	movq	29(%eax), %xmm0
+	movq	%xmm0, 29(%edx)
+L(bk_write_29bytes):
+	movq	21(%eax), %xmm0
+	movq	%xmm0, 21(%edx)
+L(bk_write_21bytes):
+	movq	13(%eax), %xmm0
+	movq	%xmm0, 13(%edx)
+L(bk_write_13bytes):
+	movq	5(%eax), %xmm0
+	movq	%xmm0, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_41bytes):
+	movq	33(%eax), %xmm0
+	movq	%xmm0, 33(%edx)
+L(bk_write_33bytes):
+	movq	25(%eax), %xmm0
+	movq	%xmm0, 25(%edx)
+L(bk_write_25bytes):
+	movq	17(%eax), %xmm0
+	movq	%xmm0, 17(%edx)
+L(bk_write_17bytes):
+	movq	9(%eax), %xmm0
+	movq	%xmm0, 9(%edx)
+L(bk_write_9bytes):
+	movq	1(%eax), %xmm0
+	movq	%xmm0, 1(%edx)
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_46bytes):
+	movq	38(%eax), %xmm0
+	movq	%xmm0, 38(%edx)
+L(bk_write_38bytes):
+	movq	30(%eax), %xmm0
+	movq	%xmm0, 30(%edx)
+L(bk_write_30bytes):
+	movq	22(%eax), %xmm0
+	movq	%xmm0, 22(%edx)
+L(bk_write_22bytes):
+	movq	14(%eax), %xmm0
+	movq	%xmm0, 14(%edx)
+L(bk_write_14bytes):
+	movq	6(%eax), %xmm0
+	movq	%xmm0, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_42bytes):
+	movq	34(%eax), %xmm0
+	movq	%xmm0, 34(%edx)
+L(bk_write_34bytes):
+	movq	26(%eax), %xmm0
+	movq	%xmm0, 26(%edx)
+L(bk_write_26bytes):
+	movq	18(%eax), %xmm0
+	movq	%xmm0, 18(%edx)
+L(bk_write_18bytes):
+	movq	10(%eax), %xmm0
+	movq	%xmm0, 10(%edx)
+L(bk_write_10bytes):
+	movq	2(%eax), %xmm0
+	movq	%xmm0, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_47bytes):
+	movq	39(%eax), %xmm0
+	movq	%xmm0, 39(%edx)
+L(bk_write_39bytes):
+	movq	31(%eax), %xmm0
+	movq	%xmm0, 31(%edx)
+L(bk_write_31bytes):
+	movq	23(%eax), %xmm0
+	movq	%xmm0, 23(%edx)
+L(bk_write_23bytes):
+	movq	15(%eax), %xmm0
+	movq	%xmm0, 15(%edx)
+L(bk_write_15bytes):
+	movq	7(%eax), %xmm0
+	movq	%xmm0, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_43bytes):
+	movq	35(%eax), %xmm0
+	movq	%xmm0, 35(%edx)
+L(bk_write_35bytes):
+	movq	27(%eax), %xmm0
+	movq	%xmm0, 27(%edx)
+L(bk_write_27bytes):
+	movq	19(%eax), %xmm0
+	movq	%xmm0, 19(%edx)
+L(bk_write_19bytes):
+	movq	11(%eax), %xmm0
+	movq	%xmm0, 11(%edx)
+L(bk_write_11bytes):
+	movq	3(%eax), %xmm0
+	movq	%xmm0, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN_END
+
+
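+/* The dispatch tables live in .rodata.ssse3.  At least in PIC builds,
+   each JMPTBL entry holds the offset of its target from the table
+   base, and BRANCH_TO_JMPTBL_ENTRY adds the table address back in at
+   run time.  */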
+	.pushsection .rodata.ssse3,"a",@progbits
+	.p2align 2
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	.p2align 2
+L(table_48bytes_fwd_align):
+	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+	.p2align 2
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	.p2align 2
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+# ifdef USE_AS_MEMMOVE
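+/* memmove only: backward copy, presumably used when the destination
+   overlaps the source at a higher address.  %edx and %edi are first
+   advanced to one past the end of dest and src, then the copy walks
+   downward.  */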
+	.p2align 4
+L(copy_backward):
+	PUSH (%edi)
+	movl	%eax, %edi
+	lea	(%ecx,%edx,1),%edx
+	lea	(%ecx,%edi,1),%edi
+	testl	$0x3, %edx
+	jnz	L(bk_align)
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movq	-8(%edi), %xmm0
+	movq	%xmm0, -8(%edx)
+	movq	-16(%edi), %xmm0
+	movq	%xmm0, -16(%edx)
+	movq	-24(%edi), %xmm0
+	movq	%xmm0, -24(%edx)
+	movq	-32(%edi), %xmm0
+	movq	%xmm0, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %edi
+
+L(bk_write_less32bytes):
+	movl	%edi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%edi)
+L(bk_write_less32bytes_2):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(bk_align):
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)
+	testl	$1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %edi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%edi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %edi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%edi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	.p2align 4
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned to 4 bytes, but not to 16 bytes.  */
+L(bk_ssse3_align):
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)
+
+	.p2align 4
+L(bk_ssse3_cpy):
+	sub	$64, %edi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%edi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%edi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%edi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%edi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+# endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 0000000000..f725944620
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,82 @@
+/* Multiple versions of memcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  In static binaries we need memcpy before the
+   initialization has happened.  */
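+/* The resolver below returns its choice in %eax, per the i386 ifunc
+   ABI: __memcpy_ia32 without SSE2; __memcpy_sse2_unaligned when
+   unaligned loads are fast or SSSE3 is absent; __memcpy_ssse3 with
+   SSSE3; and __memcpy_ssse3_rep when rep-string moves are fast too.  */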
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(memcpy)
+	.type	memcpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep)
+2:	ret
+END(memcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcpy_ia32, @function; \
+	.p2align 4; \
+	.globl __memcpy_ia32; \
+	.hidden __memcpy_ia32; \
+	__memcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memcpy_chk_ia32, @function; \
+	.globl __memcpy_chk_ia32; \
+	.p2align 4; \
+	__memcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for
+   the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
+#endif
+
+#include "../memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
new file mode 100644
index 0000000000..1b4fbe2e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __memcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  There are no multiarch memcpy functions for static
+   binaries.  */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__memcpy_chk)
+	.type	__memcpy_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep)
+2:	ret
+END(__memcpy_chk)
+# else
+#  include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
new file mode 100644
index 0000000000..3873594cb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_sse2_unaligned
+#define MEMCPY_CHK	__memmove_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
new file mode 100644
index 0000000000..d202fc4a13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_ssse3_rep
+#define MEMCPY_CHK	__memmove_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000000..295430b1ef
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_ssse3
+#define MEMCPY_CHK	__memmove_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
new file mode 100644
index 0000000000..6eb418ca7f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
@@ -0,0 +1,89 @@
+/* Multiple versions of memmove
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memmove)
+	.type	memmove, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep)
+2:	ret
+END(memmove)
+
+# ifdef SHARED
+#  undef ENTRY
+#  define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.p2align 4; \
+	.globl __memmove_ia32; \
+	.hidden __memmove_ia32; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# else
+#  undef ENTRY
+#  define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.globl __memmove_ia32; \
+	.p2align 4; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# endif
+
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memmove_chk_ia32, @function; \
+	.globl __memmove_chk_ia32; \
+	.p2align 4; \
+	__memmove_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for
+   the PLT that IFUNC uses.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memmove; __GI_memmove = __memmove_ia32
+# endif
+#endif
+
+#include "../memmove.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
new file mode 100644
index 0000000000..314834c4c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -0,0 +1,94 @@
+/* Multiple versions of __memmove_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__memmove_chk)
+	.type	__memmove_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep)
+2:	ret
+END(__memmove_chk)
+
+# ifndef SHARED
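+/* Each stub below implements the usual __chk pattern: compare the
+   destination buffer size at 16(%esp) against the copy length at
+   12(%esp), divert to __chk_fail if the buffer is too small, and
+   otherwise tail-jump to the unchecked implementation.  */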
+	.type __memmove_chk_sse2_unaligned, @function
+	.p2align 4;
+__memmove_chk_sse2_unaligned:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_sse2_unaligned
+	cfi_endproc
+	.size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned
+
+	.type __memmove_chk_ssse3, @function
+	.p2align 4;
+__memmove_chk_ssse3:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ssse3
+	cfi_endproc
+	.size __memmove_chk_ssse3, .-__memmove_chk_ssse3
+
+	.type __memmove_chk_ssse3_rep, @function
+	.p2align 4;
+__memmove_chk_ssse3_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ssse3_rep
+	cfi_endproc
+	.size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
+
+	.type __memmove_chk_ia32, @function
+	.p2align 4;
+__memmove_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ia32
+	cfi_endproc
+	.size __memmove_chk_ia32, .-__memmove_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..a1cea50771
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_sse2_unaligned
+#define MEMCPY_CHK	__mempcpy_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
new file mode 100644
index 0000000000..5357b33e18
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_ssse3_rep
+#define MEMCPY_CHK	__mempcpy_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000000..822d98e954
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_ssse3
+#define MEMCPY_CHK	__mempcpy_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
new file mode 100644
index 0000000000..06e377fbc9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -0,0 +1,81 @@
+/* Multiple versions of mempcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  In static binaries we need mempcpy before initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(__mempcpy)
+	.type	__mempcpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep)
+2:	ret
+END(__mempcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __mempcpy_ia32, @function; \
+	.p2align 4; \
+	.globl __mempcpy_ia32; \
+	.hidden __mempcpy_ia32; \
+	__mempcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __mempcpy_chk_ia32, @function; \
+	.globl __mempcpy_chk_ia32; \
+	.p2align 4; \
+	__mempcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for
+   the PLT that IFUNC uses.  */
+# define libc_hidden_def(name) \
+	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
+# define libc_hidden_builtin_def(name) \
+	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
+#endif
+
+#include "../mempcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000000..e13e5248a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __mempcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  There are no multiarch mempcpy functions for static
+   binaries.  */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__mempcpy_chk)
+	.type	__mempcpy_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep)
+2:	ret
+END(__mempcpy_chk)
+# else
+#  include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 0000000000..ef7bbbe792
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#if IS_IN (libc)
+# define MEMRCHR  __memrchr_ia32
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);
+#endif
+
+#include "string/memrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 0000000000..dbbe94fd08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,417 @@
+/* Optimized memrchr with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN   STR2+4
+
+# define MEMCHR __memrchr_sse2_bsf
+
+	.text
+ENTRY (MEMCHR)
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+	mov	LEN(%esp), %edx
+
+	sub	$16, %edx
+	jbe	L(length_less16)
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0
+	pshufd	$0, %xmm1, %xmm1
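+/* XMM1 now holds the search byte replicated into all 16 lanes: the
+   two punpcklbw steps doubled it to a word and then a dword, and
+   pshufd $0 broadcast that dword across the register.  */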
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	add	$16, %ecx
+	add	$16, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+/* The loop starts at an aligned address.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	add	$64, %ecx
+	add	$64, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
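+/* Each pcmpeqb result byte is 0x00 or 0xFF, so pmaxub acts as a
+   byte-wise OR: three pmaxub ops fold the four compare masks into
+   one, and a single pmovmskb then tests all 64 bytes at once.  */
+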
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax
+	bsr	%eax, %eax
+
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches0):
+	bsr	%eax, %eax
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16):
+	bsr	%eax, %eax
+	lea	16(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	bsr	%eax, %eax
+	lea	32(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	bsr	%eax, %eax
+	lea	48(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	bsr	%eax, %eax
+	sub	$64, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	bsr	%eax, %eax
+	sub	$48, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	16(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	bsr	%eax, %eax
+	sub	$32, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	32(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	bsr	%eax, %eax
+	sub	$16, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	48(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
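+	/* Build the mask (1 << len) - 1 and AND it with the pcmpeqb
+	   result so that bytes beyond the end of the buffer are
+	   ignored.  */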
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+	mov	%edx, %ecx
+
+	pmovmskb %xmm1, %edx
+
+	and	%ecx, %edx
+	test	%edx, %edx
+	jz	L(return_null)
+
+	bsr	%edx, %ecx
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	mov	%ecx, %eax
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx
+	jz	L(return_null)
+
+	pshufd	$0, %xmm1, %xmm1
+	and	$15, %ecx
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
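+	/* DH = (offset of the start within its aligned 16-byte chunk)
+	   + length; after the sub $16 below, DH above zero means the
+	   range spills into the next aligned chunk and needs the
+	   two-chunk path.  */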
+	mov	%cl, %dh
+	add	%dl, %dh
+	and	$-16, %eax
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi
+	add	%ecx, %eax
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000000..5f7853f683
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,724 @@
+/* Optimized memrchr with SSE2, without BSF
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN   STR2+4
+
+	atom_text_section
+ENTRY (__memrchr_sse2)
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+	mov	LEN(%esp), %edx
+
+	sub	$16, %edx
+	jbe	L(length_less16)
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	lea	16(%ecx), %ecx
+	lea	16(%edx), %edx
+	sub	%eax, %edx
+	and	$-16, %ecx
+
+	.p2align 4
+/* The loop starts at an aligned address.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	lea	64(%ecx), %ecx
+	lea	64(%edx), %edx
+	and	$-64, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches16):
+	lea	16(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	lea	32(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch):
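+/* Decode the match mask in EAX without BSF (this variant exists for
+   CPUs where BSF is slow): test the high byte, then the high nibble,
+   then individual bits, so the most significant set bit (the last
+   matching byte in the chunk) is found first.  */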
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_8):
+	test	$0x80, %al
+	jnz	L(exit_8)
+	test	$0x40, %al
+	jnz	L(exit_7)
+	test	$0x20, %al
+	jnz	L(exit_6)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(exit_dispatch_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_12)
+	test	$0x04, %ah
+	jnz	L(exit_11)
+	test	$0x02, %ah
+	jnz	L(exit_10)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_16)
+	test	$0x40, %ah
+	jnz	L(exit_15)
+	test	$0x20, %ah
+	jnz	L(exit_14)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_2):
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_4):
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_6):
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_7):
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_8):
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_10):
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_11):
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_12):
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_14):
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_15):
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_16):
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	lea	-64(%edx), %edx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	lea	-48(%edx), %edx
+	lea	16(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	lea	-32(%edx), %edx
+	lea	32(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	lea	-16(%edx), %edx
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch_1):
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_8):
+	test	$0x80, %al
+	jnz	L(exit_1_8)
+	test	$0x40, %al
+	jnz	L(exit_1_7)
+	test	$0x20, %al
+	jnz	L(exit_1_6)
+	add	$4, %edx
+	jl	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high):
+	mov	%ah, %al
+	and	$15 << 4, %al
+	jnz	L(exit_dispatch_1_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_1_12)
+	test	$0x04, %ah
+	jnz	L(exit_1_11)
+	test	$0x02, %ah
+	jnz	L(exit_1_10)
+	add	$8, %edx
+	jl	L(return_null)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_1_16)
+	test	$0x40, %ah
+	jnz	L(exit_1_15)
+	test	$0x20, %ah
+	jnz	L(exit_1_14)
+	add	$12, %edx
+	jl	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_2):
+	add	$1, %edx
+	jl	L(return_null)
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_3):
+	add	$2, %edx
+	jl	L(return_null)
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_4):
+	add	$3, %edx
+	jl	L(return_null)
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_6):
+	add	$5, %edx
+	jl	L(return_null)
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_7):
+	add	$6, %edx
+	jl	L(return_null)
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_8):
+	add	$7, %edx
+	jl	L(return_null)
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_10):
+	add	$9, %edx
+	jl	L(return_null)
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_11):
+	add	$10, %edx
+	jl	L(return_null)
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_12):
+	add	$11, %edx
+	jl	L(return_null)
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_14):
+	add	$13, %edx
+	jl	L(return_null)
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_15):
+	add	$14, %edx
+	jl	L(return_null)
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_16):
+	add	$15, %edx
+	jl	L(return_null)
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	mov	%eax, %ecx
+	pmovmskb %xmm1, %eax
+
+	and	%edx, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx
+	je	L(return_null)
+	punpcklbw %xmm1, %xmm1
+
+	mov	%ecx, %eax
+	pshufd	$0, %xmm1, %xmm1
+
+	and	$15, %ecx
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+
+	mov	%cl, %dh
+	add	%dl, %dh
+	and	$-16, %eax
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi
+	add	%ecx, %eax
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (__memrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 0000000000..d4253a553b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,45 @@
+/* Multiple versions of memrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
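+/* Resolver sketch: without SSE2 pick __memrchr_ia32; with SSE2 and
+   the Slow_BSF tuning bit pick __memrchr_sse2 (the BSF-free variant);
+   otherwise pick __memrchr_sse2_bsf.  */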
+ENTRY(__memrchr)
+	.type	__memrchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f
+
+	LOAD_FUNC_GOT_EAX (__memrchr_sse2)
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memrchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
+	ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
new file mode 100644
index 0000000000..3221077e49
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -0,0 +1,811 @@
+/* memset with SSE2 and REP string instructions.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry from a jump table into EBX and branch to it.  TABLE
+   is a jump table with relative offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
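+/* For example, with a 5-byte tail ECX is 5: entry 5 of TABLE holds
+   L(write_5bytes) - TABLE, adding the table address in EBX gives the
+   absolute target, and EDX is advanced by ECX so each L(write_Nbytes)
+   label stores backwards from the end of the destination.  */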
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2_rep)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2_rep)
+#endif
+ENTRY (__memset_sse2_rep)
+	ENTRANCE
+
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	CHR(%esp), %eax
+	movb	%al, %ah
+	/* Fill all four bytes of EAX with the pattern,
+	   e.g. 0xAB -> 0xABABABAB.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
+	movl	DEST(%esp), %edx
+	cmp	$32, %ecx
+	jae	L(32bytesormore)
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX >= 32 and EDX is 4-byte aligned.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX >= 32 and EDX is not 16-byte aligned.  */
+L(not_aligned_16):
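+	/* Store one unaligned 16-byte chunk at the start, then round
+	   EDX up to the next 16-byte boundary and shrink ECX by the
+	   distance moved; the overlapping bytes are simply written
+	   twice.  */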
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %eax
+	add	%eax, %ecx
+	movd	%xmm0, %eax
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore):
+	PUSH (%edi)
+#ifdef DATA_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$DATA_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_data_cache_size, %ebx
+# endif
+#endif
+	mov	%ebx, %edi
+	shr	$4, %ebx
+	sub	%ebx, %edi
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+/*
+ * When the data size approaches the size of the L1 cache, the fast
+ * string operation can prefetch and combine data efficiently.  EDI
+ * was set above to 15/16 of the data cache size as the threshold.
+ */
+	cmp	%edi, %ecx
+	jae	L(128bytesormore_endof_L1)
+	subl	$128, %ecx
+L(128bytesormore_normal):
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jae	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	POP (%edi)
+	add	$128, %ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(128bytesormore_endof_L1):
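+	/* Store ECX/4 dwords with rep stosl, then handle the 0-3
+	   trailing bytes by hand.  Note that rep stos leaves the flags
+	   unchanged, so the jz after it still tests the result of the
+	   and $3.  */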
+	mov	%edx, %edi
+	mov	%ecx, %edx
+	shr	$2, %ecx
+	and	$3, %edx
+	rep stosl
+	jz	L(copy_page_by_rep_exit)
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)
+	movw	%ax, (%edi)
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%edi)
+	SETRTNVAL
+	RETURN
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2_rep)
+
+#endif
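Throughout the tail ladders above, %edx points one past the end of the fill region, so each L(aligned_16_*bytes) entry finishes with a fixed sequence of stores at negative offsets: the residue n (0..15) left after the last full 16-byte chunk is decomposed into 8-, 4-, 2- and 1-byte stores (movq/movl/movw/movb). A minimal C sketch of that decomposition, using a hypothetical helper rather than the actual glibc code (the jump table dispatches directly to the right ladder entry; here the dispatch is spelled out as conditionals):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch: write the last n bytes (0 <= n < 16) of a
       fill whose one-past-the-end pointer is END, splitting n into
       8/4/2/1-byte stores exactly like the L(aligned_16_*bytes)
       ladders.  */
    static void
    memset_tail (unsigned char *end, uint64_t pat, size_t n)
    {
      unsigned char *p = end - n;
      if (n & 8) { memcpy (p, &pat, 8); p += 8; }  /* movq %xmm0, -n(%edx) */
      if (n & 4) { memcpy (p, &pat, 4); p += 4; }  /* movl %eax, ...       */
      if (n & 2) { memcpy (p, &pat, 2); p += 2; }  /* movw %ax, ...        */
      if (n & 1) *p = (unsigned char) pat;         /* movb %al, -1(%edx)   */
    }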
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
new file mode 100644
index 0000000000..d7b8be9114
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -0,0 +1,860 @@
+/* memset with SSE2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
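The two BRANCH_TO_JMPTBL_ENTRY variants above differ only in how the table entries are encoded: under SHARED each .int slot stores the target minus the table base, so the read-only table contains no absolute addresses and needs no run-time relocations. GNU C can express the same relative-offset dispatch with computed gotos; a sketch, illustrative only and not the real macro:

    /* GNU C sketch of a relative jump table: slots hold label
       differences, and dispatch adds the slot to the base label,
       mirroring `add (%ebx,%ecx,4), %ebx; jmp *%ebx'.  */
    static int
    dispatch (unsigned int idx)        /* idx plays the role of %ecx */
    {
      static const int table[] = {     /* .int JMPTBL (L(case), L(table)) */
        &&L0 - &&L0,
        &&L1 - &&L0,
      };
      goto *(&&L0 + table[idx]);
    L0:
      return 0;
    L1:
      return 1;
    }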
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2)
+#endif
+ENTRY (__memset_sse2)
+	ENTRANCE
+
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	CHR(%esp), %eax
+	movb	%al, %ah
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
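The widening sequence above replicates the fill byte into all four bytes of EAX so that 32-bit stores write the pattern. The same replication in C (sketch):

    #include <stdint.h>

    /* Sketch: replicate a fill byte across a 32-bit word.  */
    static uint32_t
    replicate_byte (unsigned char c)
    {
      uint32_t x = c;      /* movzbl CHR(%esp), %eax */
      x |= x << 8;         /* movb %al, %ah */
      x |= x << 16;        /* movl/shl/or sequence */
      return x;            /* equivalently c * 0x01010101u */
    }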
+	movl	DEST(%esp), %edx
+	cmp	$32, %ecx
+	jae	L(32bytesormore)
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX >= 32 and EDX is 4-byte aligned.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX >= 32 and EDX is not 16-byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %eax
+	add	%eax, %ecx
+	movd	%xmm0, %eax
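L(not_aligned_16) above covers the unaligned head with a single movdqu, then rounds the pointer up to the next 16-byte boundary and shrinks the count by the 1..15 bytes that unaligned store already wrote. The same adjustment in C, as a hypothetical helper for illustration:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch of the head alignment: PAT16 is 16 bytes of
       the replicated pattern.  On return *DST is 16-byte aligned and
       *N has been reduced by the bytes the unaligned store covered.  */
    static void
    align_head (unsigned char **dst, size_t *n, const unsigned char pat16[16])
    {
      memcpy (*dst, pat16, 16);                /* movdqu %xmm0, (%edx) */
      unsigned char *q = (unsigned char *)
        ((((uintptr_t) *dst) & ~(uintptr_t) 15) + 16);  /* and $-16; add $16 */
      *n -= (size_t) (q - *dst);               /* sub %edx, %eax; add %eax, %ecx */
      *dst = q;
    }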
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore):
+#ifdef SHARED_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$SHARED_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_shared_cache_size, %ebx
+# endif
+#endif
+	cmp	%ebx, %ecx
+	jae	L(128bytesormore_nt_start)
+
+
+#ifdef DATA_CACHE_SIZE
+	POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+	cmp	$DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+#  define RESTORE_EBX_STATE
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+	POP (%ebx)
+#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+	cmp	__x86_data_cache_size, %ecx
+# endif
+#endif
+
+	jae	L(128bytes_L2_normal)
+	subl	$128, %ecx
+L(128bytesormore_normal):
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jae	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	add	$128, %ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytes_L2_normal):
+	prefetcht0	0x380(%edx)
+	prefetcht0	0x3c0(%edx)
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
+	add	$128, %edx
+	cmp	$128, %ecx
+	jae	L(128bytes_L2_normal)
+
+L(128bytesless_L2_normal):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	RESTORE_EBX_STATE
+L(128bytesormore_nt_start):
+	sub	%ebx, %ecx
+	ALIGN (4)
+L(128bytesormore_shared_cache_loop):
+	prefetcht0	0x3c0(%edx)
+	prefetcht0	0x380(%edx)
+	sub	$0x80, %ebx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ebx
+	jae	L(128bytesormore_shared_cache_loop)
+	cmp	$0x80, %ecx
+	jb	L(shared_cache_loop_end)
+	ALIGN (4)
+L(128bytesormore_nt):
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm0, 0x10(%edx)
+	movntdq	%xmm0, 0x20(%edx)
+	movntdq	%xmm0, 0x30(%edx)
+	movntdq	%xmm0, 0x40(%edx)
+	movntdq	%xmm0, 0x50(%edx)
+	movntdq	%xmm0, 0x60(%edx)
+	movntdq	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ecx
+	jae	L(128bytesormore_nt)
+	sfence
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
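For fills at least as large as the shared cache, the L(128bytesormore_nt) loop above switches from movdqa to movntdq: non-temporal stores that bypass the cache hierarchy so a huge fill does not evict useful data, followed by an sfence to order them. With SSE2 intrinsics the core of that loop looks roughly like this (sketch; the assembly handles any residue below 128 bytes through the jump table afterwards):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Sketch of the non-temporal path.  DST must be 16-byte aligned.  */
    static void
    fill_nontemporal (unsigned char *dst, size_t n, __m128i pat)
    {
      for (; n >= 16; n -= 16, dst += 16)
        _mm_stream_si128 ((__m128i *) dst, pat);   /* movntdq */
      _mm_sfence ();                               /* sfence */
    }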
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
new file mode 100644
index 0000000000..f601663a9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
@@ -0,0 +1,75 @@
+/* Multiple versions of memset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memset)
+	.type	memset, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_sse2)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_sse2_rep)
+2:	ret
+END(memset)
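ENTRY(memset) above is an IFUNC resolver written in assembly: the dynamic linker calls it once, at relocation time, and patches the relocation with the address it returns in EAX. The same three-way selection expressed with GCC's ifunc attribute in C (a sketch; cpu_has_sse2 and cpu_has_fast_rep_string are hypothetical stand-ins for HAS_CPU_FEATURE/HAS_ARCH_FEATURE):

    #include <stddef.h>

    extern void *__memset_ia32 (void *, int, size_t);
    extern void *__memset_sse2 (void *, int, size_t);
    extern void *__memset_sse2_rep (void *, int, size_t);

    extern int cpu_has_sse2 (void);             /* hypothetical */
    extern int cpu_has_fast_rep_string (void);  /* hypothetical */

    static __typeof__ (__memset_ia32) *
    memset_resolver (void)
    {
      if (!cpu_has_sse2 ())                /* HAS_CPU_FEATURE (SSE2) */
        return __memset_ia32;
      if (cpu_has_fast_rep_string ())      /* HAS_ARCH_FEATURE (Fast_Rep_String) */
        return __memset_sse2_rep;
      return __memset_sse2;
    }

    void *memset (void *, int, size_t)
      __attribute__ ((ifunc ("memset_resolver")));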
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memset_ia32, @function; \
+	.globl __memset_ia32; \
+	.p2align 4; \
+	__memset_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memset_ia32, .-__memset_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memset_chk_ia32, @function; \
+	.globl __memset_chk_ia32; \
+	.p2align 4; \
+	__memset_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC does not work with hidden functions in a shared library: they
+   are called without setting up EBX, which is needed for the PLT that
+   IFUNC relies on.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_ia32
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
new file mode 100644
index 0000000000..573cf4208a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -0,0 +1,82 @@
+/* Multiple versions of __memset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__memset_chk)
+	.type	__memset_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep)
+2:	ret
+END(__memset_chk)
+
+# ifdef SHARED
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+	.text
+	.type __memset_chk_sse2, @function
+	.p2align 4;
+__memset_chk_sse2:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_sse2
+	cfi_endproc
+	.size __memset_chk_sse2, .-__memset_chk_sse2
+
+	.type __memset_chk_sse2_rep, @function
+	.p2align 4;
+__memset_chk_sse2_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_sse2_rep
+	cfi_endproc
+	.size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
+
+	.type __memset_chk_ia32, @function
+	.p2align 4;
+__memset_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_ia32
+	cfi_endproc
+	.size __memset_chk_ia32, .-__memset_chk_ia32
+# endif
+#endif
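Each __memset_chk_* stub above implements the _FORTIFY_SOURCE contract: a fourth argument carries the compiler-known size of the destination object, and the call aborts through __chk_fail before an overflowing write can happen. The check in C (sketch):

    #include <string.h>

    extern void __chk_fail (void) __attribute__ ((__noreturn__));

    /* Sketch of the stubs above: DSTLEN is the destination object size
       (16(%esp)); LEN is the requested length (12(%esp)).  */
    static void *
    memset_chk_sketch (void *dst, int c, size_t len, size_t dstlen)
    {
      if (dstlen < len)          /* cmpl %eax, 16(%esp); jb __chk_fail */
        __chk_fail ();
      return memset (dst, c, len);   /* jmp __memset_* */
    }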
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 0000000000..88c0e5776c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2_bsf
+#include "memchr-sse2-bsf.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 0000000000..038c74896b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2
+#include "memchr-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 0000000000..0a41d63ee8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of rawmemchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__rawmemchr)
+	.type	__rawmemchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f
+
+	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2)
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__rawmemchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf)
+	ret
+END(__rawmemchr)
+
+weak_alias (__rawmemchr, rawmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __rawmemchr_ia32, @function; \
+	.globl __rawmemchr_ia32; \
+	.p2align 4; \
+	__rawmemchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC does not work with hidden functions in a shared library: they
+   are called without setting up EBX, which is needed for the PLT that
+   IFUNC relies on.  */
+# define libc_hidden_def(name) \
+	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
new file mode 100644
index 0000000000..1aa5440644
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
@@ -0,0 +1 @@
+#include <string/strnlen.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
new file mode 100644
index 0000000000..2e9619f97c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fma.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+double
+__fma_fma (double x, double y, double z)
+{
+  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
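vfmadd213sd computes x*y+z with a single rounding at the end, which is the defining property of fma() that a separate multiply and add cannot provide. A small demonstration (assumes round-to-nearest; link with -lm):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 1.0 + 0x1p-27, y = 1.0 - 0x1p-27, z = -1.0;
      /* x*y is exactly 1 - 2^-54; as a double that rounds to 1.0, so
         the two-rounding form loses the low term entirely.  */
      printf ("%a\n", x * y + z);      /* prints 0x0p+0 */
      printf ("%a\n", fma (x, y, z));  /* prints -0x1p-54 */
      return 0;
    }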
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
new file mode 100644
index 0000000000..411ebb2ba9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fma.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma,
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+#define __fma __fma_ia32
+
+#include <sysdeps/ieee754/ldbl-96/s_fma.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..ee57abfda2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fmaf.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+float
+__fmaf_fma (float x, float y, float z)
+{
+  asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..00b0fbcfc5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fmaf.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf,
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+#define __fmaf __fmaf_ia32
+
+#include <sysdeps/ieee754/dbl-64/s_fmaf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
new file mode 100644
index 0000000000..7db31b02f8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
new file mode 100644
index 0000000000..46ca1b3074
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
new file mode 100644
index 0000000000..ee81ab6ae3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
@@ -0,0 +1,9 @@
+/* Multiple versions of stpcpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
new file mode 100644
index 0000000000..37a703cb76
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
new file mode 100644
index 0000000000..2698ca6a8c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
@@ -0,0 +1,8 @@
+/* Multiple versions of stpncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
new file mode 100644
index 0000000000..753c6ec84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
@@ -0,0 +1,12 @@
+#include <string.h>
+
+extern __typeof (strcasecmp) __strcasecmp_nonascii;
+
+#define __strcasecmp __strcasecmp_nonascii
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32)
+
+/* Uses of strcasecmp within libc itself are minimal, so there is no
+   need to go through the IFUNC.  */
+strong_alias (__strcasecmp_nonascii, __GI___strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
new file mode 100644
index 0000000000..ec59276408
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strcasecmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+	.text
+ENTRY(__strcasecmp)
+	.type	__strcasecmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f
+	LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2)
+2:	ret
+END(__strcasecmp)
+
+weak_alias (__strcasecmp, strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
new file mode 100644
index 0000000000..d4fcd2b4a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii;
+
+#define __strcasecmp_l __strcasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL    1
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32)
+
+/* Uses of strcasecmp_l within libc itself are minimal, so there is no
+   need to go through the IFUNC.  */
+strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
new file mode 100644
index 0000000000..411d4153f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
new file mode 100644
index 0000000000..a22b93c518
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
new file mode 100644
index 0000000000..711c09b0dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strcasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strcasecmp_l
+#define USE_AS_STRCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strcasecmp_l, strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
new file mode 100644
index 0000000000..6359c7330c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
@@ -0,0 +1,1245 @@
+/* strcat with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+#  define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register that contains
+	the index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	/* We first load PC into ECX.  */	\
+	SETUP_PIC_REG(cx);	\
+	/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ecx;	\
+	/* Get the entry and convert the relative offset to the	\
+	absolute address.  */	\
+	addl	(%ecx,INDEX,SCALE), %ecx;	\
+	/* We loaded the jump table and adjusted ECX. Go.  */	\
+	jmp	*%ecx
+# else
+#  define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute offsets.  INDEX is a register that contains the index into
+	the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_sse2
+# endif
+
+# define PARMS  4
+# define STR1  PARMS+4
+# define STR2  STR1+4
+
+# ifdef USE_AS_STRNCAT
+#  define LEN    STR2+8
+#  define STR3   STR1+4
+# else
+#  define STR3   STR1
+# endif
+
+# define USE_AS_STRCAT
+# ifdef USE_AS_STRNCAT
+#  define RETURN  POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi);
+# else
+#  define RETURN  POP(%esi); ret; CFI_PUSH(%esi);
+# endif
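The length-finding loops below all use the same SSE2 idiom: pcmpeqb compares 16 bytes against a zeroed register, pmovmskb collapses the per-byte result into a 16-bit mask in a general register, and bsf locates the first NUL. The equivalent with intrinsics (sketch in GNU C; assumes P is 16-byte aligned, as the movdqa form requires):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Sketch: return the offset of the first NUL in the 16 bytes at P,
       or 16 if there is none.  */
    static size_t
    first_nul16 (const unsigned char *p)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) p);   /* movdqa   */
      int mask = _mm_movemask_epi8                            /* pmovmskb */
        (_mm_cmpeq_epi8 (chunk, _mm_setzero_si128 ()));       /* pcmpeqb  */
      return mask ? (size_t) __builtin_ctz (mask) : 16;       /* bsf      */
    }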
+
+.text
+ENTRY (STRCAT)
+	PUSH	(%esi)
+	mov	STR1(%esp), %eax
+	mov	STR2(%esp), %esi
+# ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+# endif
+	cmpb	$0, (%esi)
+	mov	%esi, %ecx
+	mov	%eax, %edx
+	jz	L(ExitZero)
+
+	and	$63, %ecx
+	and	$63, %edx
+	cmp	$32, %ecx
+	ja	L(StrlenCore7_1)
+	cmp	$48, %edx
+	ja	L(alignment_prolog)
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm4, %xmm4
+	pxor	%xmm7, %xmm7
+	movdqu	(%eax), %xmm1
+	movdqu	(%esi), %xmm5
+	pcmpeqb	%xmm1, %xmm0
+	movdqu	16(%esi), %xmm6
+	pmovmskb %xmm0, %ecx
+	pcmpeqb	%xmm5, %xmm4
+	pcmpeqb	%xmm6, %xmm7
+	test	%ecx, %ecx
+	jnz	L(exit_less16_)
+	mov	%eax, %ecx
+	and	$-16, %eax
+	jmp	L(loop_prolog)
+
+L(alignment_prolog):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm4, %xmm4
+	mov	%edx, %ecx
+	pxor	%xmm7, %xmm7
+	and	$15, %ecx
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	movdqu	(%esi), %xmm5
+	movdqu	16(%esi), %xmm6
+	pmovmskb %xmm0, %edx
+	pcmpeqb	%xmm5, %xmm4
+	shr	%cl, %edx
+	pcmpeqb	%xmm6, %xmm7
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	add	%eax, %ecx
+
+	pxor	%xmm0, %xmm0
+L(loop_prolog):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop)
+	bsf	%edx, %edx
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit16):
+	bsf	%edx, %edx
+	lea	16(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit32):
+	bsf	%edx, %edx
+	lea	32(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit48):
+	bsf	%edx, %edx
+	lea	48(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_less16):
+	bsf	%edx, %edx
+	add	%ecx, %eax
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_less16_):
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+
+	.p2align 4
+L(StartStrcpyPart):
+	pmovmskb %xmm4, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	movdqu	%xmm5, (%eax)
+	pmovmskb %xmm7, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	mov	%esi, %ecx
+	and	$-16, %esi
+	and	$15, %ecx
+	pxor	%xmm0, %xmm0
+# ifdef USE_AS_STRNCAT
+	add	%ecx, %ebx
+	sbb	%edx, %edx
+	or	%edx, %ebx
+# endif
+	sub	%ecx, %eax
+	jmp	L(Unalign16Both)
+
+L(StrlenCore7_1):
+	mov	%eax, %ecx
+	pxor	%xmm0, %xmm0
+	and	$15, %ecx
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	shr	%cl, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16_1)
+	add	%eax, %ecx
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+
+	.p2align 4
+L(align16_loop_1):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16_1)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32_1)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48_1)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop_1)
+	bsf	%edx, %edx
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit16_1):
+	bsf	%edx, %edx
+	lea	16(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit32_1):
+	bsf	%edx, %edx
+	lea	32(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit48_1):
+	bsf	%edx, %edx
+	lea	48(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit_less16_1):
+	bsf	%edx, %edx
+	add	%ecx, %eax
+	add	%edx, %eax
+
+	.p2align 4
+L(StartStrcpyPart_1):
+	mov	%esi, %ecx
+	and	$15, %ecx
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+# ifdef USE_AS_STRNCAT
+	cmp	$48, %ebx
+	ja      L(BigN)
+# endif
+	pcmpeqb	(%esi), %xmm1
+# ifdef USE_AS_STRNCAT
+	add	%ecx, %ebx
+# endif
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%eax)
+	sub	%ecx, %eax
+
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+L(Unalign16BothBigN):
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%eax, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%eax, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%eax, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%eax, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%eax, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %eax
+# ifdef USE_AS_STRNCAT
+	lea	128(%ebx, %edx), %ebx
+# endif
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
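+	/* Fold the four 16-byte blocks with pminub: a zero byte anywhere
+	   in these 64 bytes forces a zero lane in the combined minimum,
+	   so a single pcmpeqb against %xmm0 (all zeroes) detects the
+	   terminator across the whole block.  */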
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+
+	.p2align 4
+L(Unaligned64Loop_start):
+	add	$64, %eax
+	add	$64, %esi
+	movdqu	%xmm4, -64(%eax)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%eax)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%eax)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%eax)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%eax)
+	movdqu	%xmm5, 16(%eax)
+	movdqu	%xmm6, 32(%eax)
+	add	$48, %esi
+	add	$48, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+	.p2align 4
+L(BigN):
+	pcmpeqb	(%esi), %xmm1
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%eax)
+	sub	%ecx, %eax
+	sub     $48, %ebx
+	add     %ecx, %ebx
+
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+	jmp	L(Unalign16BothBigN)
+# endif
+
+/*------------end of main part-------------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ecx, %eax
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %eax
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %esi
+	add	$16, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%eax)
+	movdqu	%xmm5, 16(%eax)
+	add	$32, %esi
+	add	$32, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %eax
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %eax
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %eax
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+# endif
+
+# ifdef USE_AS_STRNCAT
+	.p2align 4
+L(StrncatExit0):
+	movb	%bh, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+# endif
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit1):
+	movb	%bh, 1(%eax)
+# endif
+L(Exit1):
+# ifdef USE_AS_STRNCAT
+	movb	(%esi), %dh
+# endif
+	movb	%dh, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit2):
+	movb	%bh, 2(%eax)
+# endif
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit3):
+	movb	%bh, 3(%eax)
+# endif
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%eax)
+# ifdef USE_AS_STRNCAT
+	movb	2(%esi), %dh
+# endif
+	movb	%dh, 2(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit4):
+	movb	%bh, 4(%eax)
+# endif
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit5):
+	movb	%bh, 5(%eax)
+# endif
+L(Exit5):
+	movl	(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+	movb	4(%esi), %dh
+# endif
+	movb	%dh, 4(%eax)
+	movl	%ecx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit6):
+	movb	%bh, 6(%eax)
+# endif
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%eax)
+	movw	%dx, 4(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit7):
+	movb	%bh, 7(%eax)
+# endif
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%eax)
+	movl	%edx, 3(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit8):
+	movb	%bh, 8(%eax)
+# endif
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit9):
+	movb	%bh, 9(%eax)
+# endif
+L(Exit9):
+	movlpd	(%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+	movb	8(%esi), %dh
+# endif
+	movb	%dh, 8(%eax)
+	movlpd	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit10):
+	movb	%bh, 10(%eax)
+# endif
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%eax)
+	movw	%dx, 8(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit11):
+	movb	%bh, 11(%eax)
+# endif
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%eax)
+	movl	%edx, 7(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit12):
+	movb	%bh, 12(%eax)
+# endif
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%eax)
+	movl	%edx, 8(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit13):
+	movb	%bh, 13(%eax)
+# endif
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 5(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit14):
+	movb	%bh, 14(%eax)
+# endif
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 6(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit15):
+	movb	%bh, 15(%eax)
+# endif
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 7(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit16):
+	movb	%bh, 16(%eax)
+# endif
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit17):
+	movb	%bh, 17(%eax)
+# endif
+L(Exit17):
+	movdqu	(%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+	movb	16(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movb	%dh, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit18):
+	movb	%bh, 18(%eax)
+# endif
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%eax)
+	movw	%cx, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit19):
+	movb	%bh, 19(%eax)
+# endif
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit20):
+	movb	%bh, 20(%eax)
+# endif
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit21):
+	movb	%bh, 21(%eax)
+# endif
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+	movb	20(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 16(%eax)
+	movb	%dh, 20(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit22):
+	movb	%bh, 22(%eax)
+# endif
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm3, 14(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit23):
+	movb	%bh, 23(%eax)
+# endif
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm3, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit24):
+	movb	%bh, 24(%eax)
+# endif
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit25):
+	movb	%bh, 25(%eax)
+# endif
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+# ifdef USE_AS_STRNCAT
+	movb	24(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movb	%dh, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit26):
+	movb	%bh, 26(%eax)
+# endif
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movw	%cx, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit27):
+	movb	%bh, 27(%eax)
+# endif
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movl	%ecx, 23(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit28):
+	movb	%bh, 28(%eax)
+# endif
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movl	%ecx, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit29):
+	movb	%bh, 29(%eax)
+# endif
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 13(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit30):
+	movb	%bh, 30(%eax)
+# endif
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 14(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit31):
+	movb	%bh, 31(%eax)
+# endif
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit32):
+	movb	%bh, 32(%eax)
+# endif
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+# ifdef USE_AS_STRNCAT
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%eax)
+	xor	%bh, %bh
+	movb	%bh, 64(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%eax)
+	lea	16(%eax, %ecx), %eax
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+# endif
+	.p2align 4
+L(ExitZero):
+	RETURN
+
+END (STRCAT)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCAT
+L(ExitStrncatTable):
+	.int	JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
+# endif
+#endif
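Each L(ExitN) stub above copies the final N source bytes (terminator
included) using the widest loads that fit, and BRANCH_TO_JMPTBL_ENTRY
indexes L(ExitTable) by the NUL offset found by bsf.  A minimal C
sketch of that dispatch, with hypothetical helper names, is:

#include <stddef.h>
#include <string.h>

typedef void (*tail_copy_fn) (char *dst, const char *src);

/* Stand-ins for L(Exit1)..L(Exit4); the real stubs use fixed-width
   mov/movlpd/movdqu pairs instead of memcpy.  */
static void tail1 (char *d, const char *s) { memcpy (d, s, 1); }
static void tail2 (char *d, const char *s) { memcpy (d, s, 2); }
static void tail3 (char *d, const char *s) { memcpy (d, s, 3); }
static void tail4 (char *d, const char *s) { memcpy (d, s, 4); }

static const tail_copy_fn exit_table[] = { tail1, tail2, tail3, tail4 };

/* C analogue of "BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)":
   the NUL offset (bytes to copy minus one) selects the stub.  */
static void
branch_to_exit (char *dst, const char *src, size_t nul_offset)
{
  exit_table[nul_offset] (dst, src);
}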
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000000..59ffbc60a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
@@ -0,0 +1,572 @@
+/* strcat with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_ssse3
+# endif
+
+# define PARMS  4
+# define STR1  PARMS+4
+# define STR2  STR1+4
+
+# ifdef USE_AS_STRNCAT
+#  define LEN STR2+8
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+	PUSH	(%edi)
+	mov	STR1(%esp), %edi
+	mov	%edi, %edx
+
+# define RETURN  jmp L(StartStrcpyPart)
+# include "strlen-sse2.S"
+
+L(StartStrcpyPart):
+	mov	STR2(%esp), %ecx
+	lea	(%edi, %eax), %edx
+# ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	mov	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(StrncatExit0)
+	cmp	$8, %ebx
+	jbe	L(StrncatExit8Bytes)
+# endif
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%ecx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jb	L(StrncatExit15Bytes)
+# endif
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%ecx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%ecx)
+	jz	L(Exit16)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	je	L(StrncatExit16)
+
+#  define RETURN1	\
+	POP	(%ebx);	\
+	POP	(%edi);	\
+	ret;	\
+	CFI_PUSH	(%ebx);	\
+	CFI_PUSH	(%edi)
+#  define USE_AS_STRNCPY
+# else
+#  define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+# include "strcpy-ssse3.S"
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit1):
+	movb	%bh, 1(%edx)
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit2):
+	movb	%bh, 2(%edx)
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit3):
+	movb	%bh, 3(%edx)
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit4):
+	movb	%bh, 4(%edx)
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit5):
+	movb	%bh, 5(%edx)
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit6):
+	movb	%bh, 6(%edx)
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit7):
+	movb	%bh, 7(%edx)
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8):
+	movb	%bh, 8(%edx)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit9):
+	movb	%bh, 9(%edx)
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit10):
+	movb	%bh, 10(%edx)
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit11):
+	movb	%bh, 11(%edx)
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit12):
+	movb	%bh, 12(%edx)
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit13):
+	movb	%bh, 13(%edx)
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit14):
+	movb	%bh, 14(%edx)
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15):
+	movb	%bh, 15(%edx)
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit16):
+	movb	%bh, 16(%edx)
+L(Exit16):
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	lea	(%esi, %edx), %esi
+	lea	-9(%ebx), %edx
+	and	$1<<7, %dh
+	or	%al, %dh
+	test	%dh, %dh
+	lea	(%esi), %edx
+	POP	(%esi)
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
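+	/* cmpb sets CF exactly when the byte at 7(%edx) is NUL; sbb $-1
+	   then computes %eax + 1 - CF, so %eax either stays on the copied
+	   NUL or advances to the next free byte.  The store below thus
+	   terminates the string in both cases.  */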
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	xor	%cl, %cl
+	movb	%cl, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase2):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHighCase3)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	%bh, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase3):
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movb	%bh, 16(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit0):
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15Bytes):
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	movb	%bh, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8Bytes):
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	movb	%bh, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+# endif
+END (STRCAT)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
new file mode 100644
index 0000000000..8412cb6f23
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
@@ -0,0 +1,92 @@
+/* Multiple versions of strcat
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+#  define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3	__strncat_ssse3
+# define STRCAT_SSE2		__strncat_sse2
+# define STRCAT_IA32		__strncat_ia32
+# define __GI_STRCAT		__GI_strncat
+#else
+# define STRCAT_SSSE3	__strcat_ssse3
+# define STRCAT_SSE2		__strcat_sse2
+# define STRCAT_IA32		__strcat_ia32
+# define __GI_STRCAT		__GI_strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncat in the static library, since
+   we need strncat before the initialization has happened.  */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCAT)
+	.type	STRCAT, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCAT_IA32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
+2:	ret
+END(STRCAT)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCAT_IA32, @function; \
+	.align 16; \
+	.globl STRCAT_IA32; \
+	.hidden STRCAT_IA32; \
+	STRCAT_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
+
+# endif
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../../strcat.S"
+#endif
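The resolver above prefers SSE2 when unaligned loads are fast and falls
back to SSSE3 otherwise.  In rough C terms (the predicates stand in for
the HAS_CPU_FEATURE/HAS_ARCH_FEATURE macros, so this is a sketch of the
selection logic, not how the resolver is actually built):

typedef char *(*strcat_fn) (char *, const char *);

extern char *__strcat_ia32 (char *, const char *);
extern char *__strcat_sse2 (char *, const char *);
extern char *__strcat_ssse3 (char *, const char *);

static strcat_fn
select_strcat (int has_sse2, int has_ssse3, int fast_unaligned_load)
{
  strcat_fn fn = __strcat_ia32;	/* default for plain i686 */
  if (has_sse2)
    {
      fn = __strcat_sse2;
      if (!fast_unaligned_load && has_ssse3)
	fn = __strcat_ssse3;
    }
  return fn;
}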
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
new file mode 100644
index 0000000000..95fd7c084e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -0,0 +1,158 @@
+/* strchr with SSE2 and bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  8
+# define ENTRANCE PUSH(%edi)
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	.text
+ENTRY (__strchr_sse2_bsf)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	je	L(loop)
+
+/* Handle unaligned string.  */
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	bsf	%eax, %eax
+	/* Is there a NULL? */
+	test	%edx, %edx
+	je	L(unaligned_match)
+	bsf	%edx, %edx
+	cmpl	%edx, %eax
+	/* Return NULL if NULL comes first.  */
+	ja	L(return_null)
+L(unaligned_match):
+	add	%edi, %eax
+	add	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%edx, %edx
+	jne	L(return_null)
+	pxor	%xmm2, %xmm2
+
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	jmp	L(loop)
+
+L(matches):
+	pmovmskb %xmm2, %edx
+	test	%eax, %eax
+	jz	L(return_null)
+	bsf	%eax, %eax
+	/* There is a match.  First find where NULL is.  */
+	test	%edx, %edx
+	je	L(match)
+	bsf	%edx, %ecx
+	/* Check if NULL comes first.  */
+	cmpl	%ecx, %eax
+	ja	L(return_null)
+L(match):
+	sub	$16, %edi
+	add	%edi, %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2_bsf)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
new file mode 100644
index 0000000000..1f9e875b04
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
@@ -0,0 +1,348 @@
+/* strchr with SSE2, without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  8
+# define ENTRANCE PUSH(%edi)
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	atom_text_section
+ENTRY (__strchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	je	L(loop)
+
+/* Handle unaligned string.  */
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+	/* Add the offset masked off above back to the pointer and
+	   check whether a NUL terminator appears in this block.  */
+	add	%ecx, %edi
+	test	%edx, %edx
+	jz	L(match_case1)
+	jmp	L(match_case2)
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%edx, %edx
+	jne	L(return_null)
+
+	pxor	%xmm2, %xmm2
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+	jmp	L(loop)
+
+L(matches):
+	/* There is a match.  First find where NULL is.  */
+	test	%edx, %edx
+	jz	L(match_case1)
+
+	.p2align 4
+L(match_case2):
+	test	%al, %al
+	jz	L(match_high_case2)
+
+	mov	%al, %cl
+	and	$15, %cl
+	jnz	L(match_case2_4)
+
+	mov	%dl, %ch
+	and	$15, %ch
+	jnz	L(return_null)
+
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x10, %dl
+	jnz	L(return_null)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x20, %dl
+	jnz	L(return_null)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x40, %dl
+	jnz	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_4):
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x01, %dl
+	jnz	L(return_null)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x02, %dl
+	jnz	L(return_null)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x04, %dl
+	jnz	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_high_case2):
+	test	%dl, %dl
+	jnz	L(return_null)
+
+	mov	%ah, %cl
+	and	$15, %cl
+	jnz	L(match_case2_12)
+
+	mov	%dh, %ch
+	and	$15, %ch
+	jnz	L(return_null)
+
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x10, %dh
+	jnz	L(return_null)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x20, %dh
+	jnz	L(return_null)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x40, %dh
+	jnz	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_12):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x01, %dh
+	jnz	L(return_null)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x02, %dh
+	jnz	L(return_null)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x04, %dh
+	jnz	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1):
+	test	%al, %al
+	jz	L(match_high_case1)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_high_case1):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	lea	(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	14(%edi), %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2)
+#endif
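The unrolled loop above performs, 16 bytes at a time, the same decision
as this scalar reference (a sketch for orientation only):

/* Scalar equivalent of the SSE2 logic: the match test precedes the
   terminator test, so strchr (s, '\0') correctly returns a pointer
   to the terminator, and a NUL before the first match yields NULL.  */
static char *
strchr_scalar (const char *s, int c)
{
  for (;; ++s)
    {
      if (*s == (char) c)
	return (char *) s;
      if (*s == '\0')
	return NULL;
    }
}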
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
new file mode 100644
index 0000000000..5b97b1c767
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strchr)
+	.type	strchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strchr_sse2)
+2:	ret
+END(strchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strchr_ia32, @function; \
+	.globl __strchr_ia32; \
+	.p2align 4; \
+	__strchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strchr_ia32, .-__strchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT that IFUNC relies on.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strchr; __GI_strchr = __strchr_ia32
+#endif
+
+#include "../../i586/strchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..cd26058671
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,804 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define REM		%ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strcasecmp_l_sse4_2
+# endif
+# ifdef PIC
+#  define STR1		12
+# else
+#  define STR1		8
+# endif
+# define STR2		STR1+4
+# define LOCALE		12	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); ret; .p2align 4; CFI_PUSH (%edi)
+# endif
+# define NONASCII	__strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_sse4_2
+# endif
+# ifdef PIC
+#  define STR1		16
+# else
+#  define STR1		12
+# endif
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define LOCALE		16	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (REM); POP (%ebx); ret; \
+			.p2align 4; \
+			CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); POP (REM); ret; \
+			.p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi)
+# endif
+# define REM		%ebp
+# define NONASCII	__strncasecmp_nonascii
+#else
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_sse4_2)
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+	je	L(ascii)
+	POP	(%ebx)
+	jmp	__strcasecmp_nonascii
+# else
+	jne	__strcasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strcasecmp_sse4_2)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_sse4_2)
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+	je	L(ascii)
+	POP	(%ebx)
+	jmp	__strncasecmp_nonascii
+# else
+	jne	__strncasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strncasecmp_sse4_2)
+#endif
+
+	ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII
+
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+# ifdef PIC
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)
+#endif
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM
+	test	REM, REM
+	je	L(eq)
+#endif
+	mov	%dx, %cx
+	and	$0xfff, %cx
+	cmp	$0xff0, %cx
+	ja	L(first4bytes)
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm3;						      \
+	movdqa	UCHIGH_reg, %xmm4;					      \
+	movdqa	reg2, %xmm5;						      \
+	movdqa	UCHIGH_reg, %xmm6;					      \
+	pcmpgtb	UCLOW_reg, %xmm3;					      \
+	pcmpgtb	reg1, %xmm4;						      \
+	pcmpgtb	UCLOW_reg, %xmm5;					      \
+	pcmpgtb	reg2, %xmm6;						      \
+	pand	%xmm4, %xmm3;						      \
+	pand	%xmm6, %xmm5;						      \
+	pand	LCQWORD_reg, %xmm3;					      \
+	pand	LCQWORD_reg, %xmm5;					      \
+	por	%xmm3, reg1;						      \
+	por	%xmm5, reg2
+
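+	/* The TOLOWER macro above is a vector ASCII tolower: bytes
+	   strictly between '@' (0x40) and '[' (0x5b), i.e. 'A'-'Z',
+	   get 0x20 OR'ed in.  Per byte that is
+	       if (b > 0x40 && b < 0x5b) b |= 0x20;
+	   with signed comparisons, exactly as pcmpgtb performs them.  */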
+	movdqu	(%eax), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	movd	%xmm2, %ecx
+	movd	%xmm1, %edi
+	movdqa	%xmm2, %xmm3
+	movdqa	%xmm1, %xmm4
+	cmpl	%edi, %ecx
+#else
+# define TOLOWER(reg1, reg)
+
+	movd	%xmm2, %ecx
+	cmp	(%eax), %ecx
+#endif
+	jne	L(less4bytes)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	movdqu	(%eax), %xmm1
+#endif
+	pxor	%xmm2, %xmm1
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0
+	jnc	L(less16bytes)
+	pcmpeqb	%xmm0, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
+	jbe	L(eq)
+#endif
+	add	$16, %edx
+	add	$16, %eax
+L(first4bytes):
+	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, (%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	je	L(eq)
+#endif
+
+	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 1(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 2(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 3(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 4(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 5(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 6(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 7(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM
+	je	L(eq)
+#endif
+	add	$8, %eax
+	add	$8, %edx
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)
+#endif
+	PUSH	(%esi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_remember_state
+#endif
+	mov	%edx, %edi
+	mov	%eax, %esi
+	xorl	%eax, %eax
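+	/* Keep the unaligned 16-byte loads in L(loop) from touching
+	   an unmapped page: take the larger of the two 4 KiB page
+	   offsets, rebase both pointers by its distance to 0xff0, and
+	   run the loop on a non-positive index.  %edx turning
+	   positive means the next load could cross a page end.  */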
+L(check_offset):
+	movl	%edi, %edx
+	movl	%esi, %ecx
+	andl	$0xfff, %edx
+	andl	$0xfff, %ecx
+	cmpl	%edx, %ecx
+	cmovl	%edx, %ecx
+	lea	-0xff0(%ecx), %edx
+	sub	%edx, %edi
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)
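+	/* pcmpistri imm8 0x1a: signed bytes, equal-each, negated
+	   result, least-significant index.  %ecx gets the offset of
+	   the first differing byte; CF is set on a difference and ZF
+	   when the chunk in %xmm2 contains a NUL, so jbe leaves the
+	   loop on either a mismatch or end of string.  */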
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	pcmpistri	$0x1a, %xmm2, %xmm1
+	jbe	L(end)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
+	jbe	L(more16byteseq)
+#endif
+
+	add	$16, %edx
+	jle	L(loop)
+L(crosspage):
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax
+	jne	L(ret)
+	testl	%ecx, %ecx
+	je	L(ret)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$1, REM
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)
+	add	%edx, %edi
+	add	%edx, %esi
+	jmp	L(check_offset)
+
+	.p2align 4
+L(end):
+	jnc	L(ret)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	%ecx, REM
+	jbe	L(more16byteseq)
+#endif
+	lea	(%ecx,%edx), %ecx
+	movzbl	(%edi,%ecx), %eax
+	movzbl	(%esi,%ecx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax
+L(ret):
+	POP	(%esi)
+	POP	(%edi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	ret
+
+	.p2align 4
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_restore_state
+L(more16byteseq):
+	POP	(%esi)
+# ifdef USE_AS_STRNCMP
+	POP	(%edi)
+# endif
+#endif
+L(eq):
+	xorl	%eax, %eax
+	RETURN
+
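+	/* The flags are still those of the byte compare that branched
+	   here: "above" from that unsigned compare selects +1,
+	   otherwise -1.  */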
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+	RETURN
+
+L(less16bytes):
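+	/* 0xfefefeff is -0x01010101: the add/jnc, xor, or/add
+	   sequence is the classic dword test for "contains a NUL or
+	   differing byte", deciding whether the interesting byte lies
+	   in the first four bytes or in bytes 4-7 before dropping to
+	   the byte-wise checks.  */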
+	add	$0xfefefeff, %ecx
+	jnc	L(less4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movd	%xmm3, %edi
+	xor	%edi, %ecx
+#else
+	xor	(%edx), %ecx
+#endif
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(less4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(eq)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	psrldq	$4, %xmm3
+	psrldq	$4, %xmm4
+	movd	%xmm3, %ecx
+	movd	%xmm4, %edi
+	cmp	%edi, %ecx
+	mov	%ecx, %edi
+#else
+	mov	4(%edx), %ecx
+	cmp	4(%eax), %ecx
+#endif
+	jne	L(more4bytes)
+	add	$0xfefefeff, %ecx
+	jnc	L(more4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	xor	%edi, %ecx
+#else
+	xor	4(%edx), %ecx
+#endif
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(more4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM
+	jbe	L(eq)
+#endif
+
+	add	$8, %edx
+	add	$8, %eax
+L(less4bytes):
+
+	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, (%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	je	L(eq)
+#endif
+	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 1(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	je	L(eq)
+#endif
+
+	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 2(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 3(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+L(more4bytes):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 4(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 5(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 6(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 7(%edx)
+#endif
+	jne	L(neq)
+	jmp	L(eq)
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..b25cc3e068
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -0,0 +1,2810 @@
+/* strcmp with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strncmp_ssse3
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate the number of bytes left to compare */	\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, REM;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, REM
+# define FLAGS		%ebx
+# define REM		%ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strcasecmp_l_ssse3
+# endif
+# ifdef PIC
+#  define STR1		8
+# else
+#  define STR1		4
+# endif
+# define STR2		STR1+4
+# define LOCALE		12	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx)
+# else
+#  define RETURN	ret; .p2align 4
+# endif
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS		(%esp)
+# define NONASCII	__strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_ssse3
+# endif
+# ifdef PIC
+#  define STR1		12
+# else
+#  define STR1		8
+# endif
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define LOCALE		16	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (REM); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM)
+# else
+#  define RETURN	POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# endif
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate the number of bytes left to compare */	\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, REM;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, REM
+# define FLAGS		(%esp)
+# define REM		%ebp
+# define NONASCII	__strncasecmp_nonascii
+#else
+# ifndef STRCMP
+#  define STRCMP	__strcmp_ssse3
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS		%ebx
+#endif
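+/* Register conventions: REM (%ebp) tracks the remaining byte count
+   in the strn* variants.  FLAGS holds the swap bit and alignment
+   shift used by the ashr paths; it lives in %ebx for strcmp and
+   strncmp but in a stack slot for the case-insensitive variants,
+   which reserve %ebx as the PIC base for the tolower table.  */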
+
+	.section .text.ssse3,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_ssse3)
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+	je	L(ascii)
+	POP	(%ebx)
+	jmp	__strcasecmp_nonascii
+# else
+	jne	__strcasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strcasecmp_ssse3)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_ssse3)
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+	je	L(ascii)
+	POP	(%ebx)
+	jmp	__strncasecmp_nonascii
+# else
+	jne	__strncasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strncasecmp_ssse3)
+#endif
+
+ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII
+
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
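+	/* 0x40 is 'A'-1, 0x5b is 'Z'+1 and 0x20 is the ASCII case
+	   bit; TOLOWER below ORs the case bit into every byte lying
+	   strictly between the first two.  */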
+
+# ifdef PIC
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)
+#endif
+
+	movl	STR1(%esp), %edx
+	movl	STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM
+	cmp	$16, REM
+	jb	L(less16bytes_sncmp)
+#elif !defined USE_AS_STRCASECMP_L
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	add	$8, %edx
+	add	$8, %eax
+#endif
+	movl	%edx, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	pxor	%xmm0, %xmm0
+	movlpd	(%eax), %xmm1
+	movlpd	(%edx), %xmm2
+	movhpd	8(%eax), %xmm1
+	movhpd	8(%edx), %xmm2
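+	/* TOLOWER lower-cases both chunks in place using the masks
+	   defined above (signed pcmpgtb range checks, then OR of the
+	   case bit); it clobbers %xmm5-%xmm7 and expands to nothing
+	   for the case-sensitive variants.  */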
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm5;					\
+	movdqa	reg2, %xmm7;					\
+	movdqa	UCHIGH_reg, %xmm6;				\
+	pcmpgtb	UCLOW_reg, %xmm5;				\
+	pcmpgtb	UCLOW_reg, %xmm7;				\
+	pcmpgtb	reg1, %xmm6;					\
+	pand	%xmm6, %xmm5;					\
+	movdqa	UCHIGH_reg, %xmm6;				\
+	pcmpgtb	reg2, %xmm6;					\
+	pand	%xmm6, %xmm7;					\
+	pand	LCQWORD_reg, %xmm5;				\
+	por	%xmm5, reg1;					\
+	pand	LCQWORD_reg, %xmm7;				\
+	por	%xmm7, reg2
+	TOLOWER (%xmm1, %xmm2)
+#else
+# define TOLOWER(reg1, reg2)
+#endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %ecx
+	sub	$0xffff, %ecx
+	jnz	L(less16bytes)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(eq)
+#endif
+	add	$16, %eax
+	add	$16, %edx
+
+L(crosspage):
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	PUSH	(FLAGS)
+#endif
+	PUSH	(%edi)
+	PUSH	(%esi)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	pushl	$0
+	cfi_adjust_cfa_offset (4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_remember_state
+#endif
+
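+	/* Align both pointers down to 16 bytes.  Equal low nibbles
+	   take L(ashr_0); otherwise the operands are swapped if
+	   needed (bit 0x20 in FLAGS makes L(less32bytes) undo the
+	   swap) and %edi = 15 - (offset distance) dispatches to
+	   L(ashr_1) .. L(ashr_15).  */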
+	movl	%edx, %edi
+	movl	%eax, %ecx
+	and	$0xf, %ecx
+	and	$0xf, %edi
+	xor	%ecx, %eax
+	xor	%edi, %edx
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	xor	FLAGS, FLAGS
+#endif
+	cmp	%edi, %ecx
+	je	L(ashr_0)
+	ja	L(bigger)
+	orl	$0x20, FLAGS
+	xchg	%edx, %eax
+	xchg	%ecx, %edi
+L(bigger):
+	lea	15(%edi), %edi
+	sub	%ecx, %edi
+	cmp	$8, %edi
+	jle	L(ashr_less_8)
+	cmp	$14, %edi
+	je	L(ashr_15)
+	cmp	$13, %edi
+	je	L(ashr_14)
+	cmp	$12, %edi
+	je	L(ashr_13)
+	cmp	$11, %edi
+	je	L(ashr_12)
+	cmp	$10, %edi
+	je	L(ashr_11)
+	cmp	$9, %edi
+	je	L(ashr_10)
+L(ashr_less_8):
+	je	L(ashr_9)
+	cmp	$7, %edi
+	je	L(ashr_8)
+	cmp	$6, %edi
+	je	L(ashr_7)
+	cmp	$5, %edi
+	je	L(ashr_6)
+	cmp	$4, %edi
+	je	L(ashr_5)
+	cmp	$3, %edi
+	je	L(ashr_4)
+	cmp	$2, %edi
+	je	L(ashr_3)
+	cmp	$1, %edi
+	je	L(ashr_2)
+	cmp	$0, %edi
+	je	L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0
+ *  ecx(offset of esi)  eax(offset of edi)  relative offset  corresponding case
+ *        n(0~15)            n(0~15)           15(15 +(n-n))        ashr_0
+ */
+	.p2align 4
+L(ashr_0):
+	mov	$0xffff, %esi
+	movdqa	(%eax), %xmm1
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movdqa	(%edx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm2, %xmm1
+#else
+	pcmpeqb	(%edx), %xmm1
+#endif
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	mov	%ecx, %edi
+	jne	L(less32bytes)
+	UPDATE_STRNCMP_COUNTER
+	movl	$0x10, FLAGS
+	mov	$0x10, %ecx
+	pxor	%xmm0, %xmm0
+	.p2align 4
+L(loop_ashr_0):
+	movdqa	(%eax, %ecx), %xmm1
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movdqa	(%edx, %ecx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+#else
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	(%edx, %ecx), %xmm1
+#endif
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	jmp	L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+L(ashr_1):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$15, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-15(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$1, FLAGS
+	lea	1(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_1):
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_1)
+
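+	/* Page-end handling for the merge loop: scan the saved %edx
+	   chunk for a NUL in the bytes the next palignr would still
+	   consume (the test mask drops the bytes already used); if
+	   one is found, or the strncmp count is nearly exhausted,
+	   finish through the exittail path instead of reading past
+	   the page.  */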
+	.p2align 4
+L(nibble_ashr_1):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffe, %esi
+	jnz	L(ashr_1_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$15, REM
+	jbe	L(ashr_1_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_1)
+
+	.p2align 4
+L(ashr_1_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$1, %xmm0
+	psrldq	$1, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(14~15)            n -14            1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+L(ashr_2):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-14(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$2, FLAGS
+	lea	2(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_2):
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_2)
+
+	.p2align 4
+L(nibble_ashr_2):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffc, %esi
+	jnz	L(ashr_2_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$14, REM
+	jbe	L(ashr_2_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_2)
+
+	.p2align 4
+L(ashr_2_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$2, %xmm0
+	psrldq	$2, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(13~15)            n -13            2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+L(ashr_3):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-13(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$3, FLAGS
+	lea	3(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_3):
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_3)
+
+	.p2align 4
+L(nibble_ashr_3):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff8, %esi
+	jnz	L(ashr_3_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$13, REM
+	jbe	L(ashr_3_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_3)
+
+	.p2align 4
+L(ashr_3_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$3, %xmm0
+	psrldq	$3, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(12~15)            n -12            3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+L(ashr_4):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-12(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$4, FLAGS
+	lea	4(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_4):
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_4)
+
+	.p2align 4
+L(nibble_ashr_4):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff0, %esi
+	jnz	L(ashr_4_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$12, REM
+	jbe	L(ashr_4_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_4)
+
+	.p2align 4
+L(ashr_4_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$4, %xmm0
+	psrldq	$4, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(11~15)            n -11            4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+L(ashr_5):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-11(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$5, FLAGS
+	lea	5(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_5):
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_5)
+
+	.p2align 4
+L(nibble_ashr_5):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffe0, %esi
+	jnz	L(ashr_5_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$11, REM
+	jbe	L(ashr_5_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_5)
+
+	.p2align 4
+L(ashr_5_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$5, %xmm0
+	psrldq	$5, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(10~15)            n -10            5(15 +(n-10) - n)         ashr_6
+ */
+
+	.p2align 4
+L(ashr_6):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-10(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$6, FLAGS
+	lea	6(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_6):
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_6)
+
+	.p2align 4
+L(nibble_ashr_6):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffc0, %esi
+	jnz	L(ashr_6_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$10, REM
+	jbe	L(ashr_6_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_6)
+
+	.p2align 4
+L(ashr_6_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$6, %xmm0
+	psrldq	$6, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(9~15)            n - 9            6(15 +(n-9) - n)         ashr_7
+ */
+
+	.p2align 4
+L(ashr_7):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$9, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-9(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$7, FLAGS
+	lea	7(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_7):
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_7)
+
+	.p2align 4
+L(nibble_ashr_7):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff80, %esi
+	jnz	L(ashr_7_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$9, REM
+	jbe	L(ashr_7_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_7)
+
+	.p2align 4
+L(ashr_7_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$7, %xmm0
+	psrldq	$7, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(8~15)            n - 8            7(15 +(n-8) - n)         ashr_8
+ */
+	.p2align 4
+L(ashr_8):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-8(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$8, FLAGS
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_8):
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_8)
+
+	.p2align 4
+L(nibble_ashr_8):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff00, %esi
+	jnz	L(ashr_8_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$8, REM
+	jbe	L(ashr_8_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_8)
+
+	.p2align 4
+L(ashr_8_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$8, %xmm0
+	psrldq	$8, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(7~15)            n - 7            8(15 +(n-7) - n)         ashr_9
+ */
+	.p2align 4
+L(ashr_9):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-7(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$9, FLAGS
+	lea	9(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_9):
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_9)
+
+	.p2align 4
+L(nibble_ashr_9):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfe00, %esi
+	jnz	L(ashr_9_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(ashr_9_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_9)
+
+	.p2align 4
+L(ashr_9_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$9, %xmm0
+	psrldq	$9, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(6~15)            n - 6            9(15 +(n-6) - n)         ashr_10
+ */
+	.p2align 4
+L(ashr_10):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-6(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$10, FLAGS
+	lea	10(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_10):
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_10)
+
+	.p2align 4
+L(nibble_ashr_10):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfc00, %esi
+	jnz	L(ashr_10_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	jbe	L(ashr_10_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_10)
+
+	.p2align 4
+L(ashr_10_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$10, %xmm0
+	psrldq	$10, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(5~15)            n - 5            10(15 +(n-5) - n)         ashr_11
+ */
+	.p2align 4
+L(ashr_11):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-5(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$11, FLAGS
+	lea	11(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_11):
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_11)
+
+	.p2align 4
+L(nibble_ashr_11):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf800, %esi
+	jnz	L(ashr_11_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	jbe	L(ashr_11_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_11)
+
+	.p2align 4
+L(ashr_11_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$11, %xmm0
+	psrldq	$11, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(4~15)            n - 4            11(15 +(n-4) - n)         ashr_12
+ */
+	.p2align 4
+L(ashr_12):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-4(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$12, FLAGS
+	lea	12(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_12):
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_12)
+
+	.p2align 4
+L(nibble_ashr_12):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf000, %esi
+	jnz	L(ashr_12_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(ashr_12_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_12)
+
+	.p2align 4
+L(ashr_12_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$12, %xmm0
+	psrldq	$12, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(3~15)            n - 3            12(15 +(n-3) - n)         ashr_13
+ */
+	.p2align 4
+L(ashr_13):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-3(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$13, FLAGS
+	lea	13(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_13):
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_13)
+
+	.p2align 4
+L(nibble_ashr_13):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xe000, %esi
+	jnz	L(ashr_13_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	jbe	L(ashr_13_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_13)
+
+	.p2align 4
+L(ashr_13_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$13, %xmm0
+	psrldq	$13, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(2~15)            n - 2            13(15 +(n-2) - n)         ashr_14
+ */
+	.p2align 4
+L(ashr_14):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$2, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-2(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$14, FLAGS
+	lea	14(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_14):
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_14)
+
+	.p2align 4
+L(nibble_ashr_14):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xc000, %esi
+	jnz	L(ashr_14_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	jbe	L(ashr_14_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_14)
+
+	.p2align 4
+L(ashr_14_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$14, %xmm0
+	psrldq	$14, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(1~15)            n - 1            14(15 +(n-1) - n)         ashr_15
+ */
+
+	.p2align 4
+L(ashr_15):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$1, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-1(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$15, FLAGS
+	lea	15(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_15):
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_15)
+
+	.p2align 4
+L(nibble_ashr_15):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0x8000, %esi
+	jnz	L(ashr_15_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	jbe	L(ashr_15_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_15)
+
+	.p2align 4
+L(ashr_15_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$15, %xmm0
+	psrldq	$15, %xmm3
+	jmp	L(aftertail)
+
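+	/* Common tail for all ashr cases: %xmm3 holds the surviving
+	   source bytes shifted into position and %xmm0 their NUL
+	   mask; one more compare yields the mismatch bitmap, inverted
+	   so that L(exit) can index the first interesting byte.  */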
+	.p2align 4
+L(aftertail):
+	TOLOWER (%xmm1, %xmm3)
+	pcmpeqb	%xmm3, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	not	%esi
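+	/* Map the bitmap index back to byte addresses: the low five
+	   bits of FLAGS are the alignment shift applied to %edx, and
+	   bit 0x20 says the operands were swapped at dispatch, so
+	   swap them back before the final byte comparison.  */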
+L(exit):
+	mov	FLAGS, %edi
+	and	$0x1f, %edi
+	lea	-16(%edi, %ecx), %edi
+L(less32bytes):
+	add	%edi, %edx
+	add	%ecx, %eax
+	testl	$0x20, FLAGS
+	jz	L(ret2)
+	xchg	%eax, %edx
+
+	.p2align 4
+L(ret2):
+	mov	%esi, %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+#endif
+	POP	(%esi)
+	POP	(%edi)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	POP	(FLAGS)
+#endif
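+	/* %ecx now holds the inverted compare bitmap; the lowest set
+	   bit indexes the first differing or terminating byte.  Bytes
+	   0-7 are resolved here, bytes 8-15 in L(2next_8_bytes).  */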
+L(less16bytes):
+	test	%cl, %cl
+	jz	L(2next_8_bytes)
+
+	test	$0x01, %cl
+	jnz	L(Byte0)
+
+	test	$0x02, %cl
+	jnz	L(Byte1)
+
+	test	$0x04, %cl
+	jnz	L(Byte2)
+
+	test	$0x08, %cl
+	jnz	L(Byte3)
+
+	test	$0x10, %cl
+	jnz	L(Byte4)
+
+	test	$0x20, %cl
+	jnz	L(Byte5)
+
+	test	$0x40, %cl
+	jnz	L(Byte6)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(eq)
+#endif
+
+	movzbl	7(%eax), %ecx
+	movzbl	7(%edx), %eax
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte0):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$0, REM
+	jbe	L(eq)
+#endif
+	movzbl	(%eax), %ecx
+	movzbl	(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte1):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	jbe	L(eq)
+#endif
+	movzbl	1(%eax), %ecx
+	movzbl	1(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte2):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	jbe	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+	movzbl	2(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte3):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	jbe	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	movzbl	3(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte4):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	movzbl	4(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte5):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	jbe	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	movzbl	5(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte6):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	jbe	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	movzbl	6(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(2next_8_bytes):
+	add	$8, %eax
+	add	$8, %edx
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$8, REM
+	lea	-8(REM), REM
+	jbe	L(eq)
+#endif
+
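+	/* %ch holds bits 8..15 of the 16-bit mismatch mask, so after
+	   advancing both pointers by 8 the L(ByteN) stubs inspect the
+	   corresponding byte at its new offset N.  */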
+	test	$0x01, %ch
+	jnz	L(Byte0)
+
+	test	$0x02, %ch
+	jnz	L(Byte1)
+
+	test	$0x04, %ch
+	jnz	L(Byte2)
+
+	test	$0x08, %ch
+	jnz	L(Byte3)
+
+	test	$0x10, %ch
+	jnz	L(Byte4)
+
+	test	$0x20, %ch
+	jnz	L(Byte5)
+
+	test	$0x40, %ch
+	jnz	L(Byte6)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(eq)
+#endif
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+#ifdef USE_AS_STRNCMP
+L(neq_sncmp):
+#endif
+L(neq):
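+	/* The flags from the byte comparison that branched here are still
+	   live; keep the +1 if the differing byte was above, otherwise
+	   negate to -1 (mov does not clobber flags).  */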
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	.p2align 4
+	cfi_restore_state
+L(more8byteseq):
+
+# ifdef USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+# endif
+	POP	(%esi)
+	POP	(%edi)
+# ifdef USE_AS_STRNCMP
+	POP	(FLAGS)
+# endif
+#endif
+
+#ifdef USE_AS_STRNCMP
+L(eq_sncmp):
+#endif
+L(eq):
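+	/* Strings are equal (within the length limit, if any); return 0.  */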
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	xorl	%eax, %eax
+	ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	.p2align 4
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+	CFI_PUSH (%ebx)
+# endif
+	CFI_PUSH (REM)
+L(less16bytes_sncmp):
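+	/* Short strncmp/strncasecmp path: compare up to 16 bytes one at a
+	   time, stopping at a NUL, a difference, or when REM runs out.  */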
+# ifdef USE_AS_STRNCASECMP_L
+	PUSH	(%esi)
+# endif
+	test	REM, REM
+	jz	L(eq_sncmp)
+
+	movzbl	(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, (%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$1, REM
+	je	L(eq_sncmp)
+
+	movzbl	1(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 1(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$2, REM
+	je	L(eq_sncmp)
+
+	movzbl	2(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 2(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$3, REM
+	je	L(eq_sncmp)
+
+	movzbl	3(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 3(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$4, REM
+	je	L(eq_sncmp)
+
+	movzbl	4(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 4(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$5, REM
+	je	L(eq_sncmp)
+
+	movzbl	5(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 5(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$6, REM
+	je	L(eq_sncmp)
+
+	movzbl	6(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 6(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$7, REM
+	je	L(eq_sncmp)
+
+	movzbl	7(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 7(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+
+	cmp	$8, REM
+	je	L(eq_sncmp)
+
+	movzbl	8(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	8(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 8(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$9, REM
+	je	L(eq_sncmp)
+
+	movzbl	9(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	9(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 9(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$10, REM
+	je	L(eq_sncmp)
+
+	movzbl	10(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	10(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 10(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$11, REM
+	je	L(eq_sncmp)
+
+	movzbl	11(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	11(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 11(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+
+	cmp	$12, REM
+	je	L(eq_sncmp)
+
+	movzbl	12(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	12(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 12(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$13, REM
+	je	L(eq_sncmp)
+
+	movzbl	13(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	13(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 13(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$14, REM
+	je	L(eq_sncmp)
+
+	movzbl	14(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	14(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 14(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$15, REM
+	je	L(eq_sncmp)
+
+	movzbl	15(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	15(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 15(%edx)
+# endif
+	jne	L(neq_sncmp)
+
+# ifdef USE_AS_STRNCASECMP_L
+L(eq_sncmp):
+	POP	(%esi)
+# endif
+	POP	(REM)
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+	POP	(%ebx)
+# endif
+	xor	%eax, %eax
+	ret
+
+# ifdef USE_AS_STRNCASECMP_L
+	.p2align 4
+#  ifdef PIC
+	CFI_PUSH (%ebx)
+#  endif
+	CFI_PUSH (REM)
+	CFI_PUSH (%esi)
+L(neq_sncmp):
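+	/* Branchless sign selection: the flags from the last byte compare
+	   survive the two movs, so cmovna swaps the +1 for -1 when the
+	   "above" condition does not hold.  */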
+	mov	$1, %eax
+	mov	$-1, %edx
+	cmovna	%edx, %eax
+	POP	(%esi)
+	POP	(REM)
+#  ifdef PIC
+	POP	(%ebx)
+#  endif
+	ret
+# endif
+#endif
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
new file mode 100644
index 0000000000..56de25a4b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
@@ -0,0 +1,95 @@
+/* Multiple versions of strcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRNCMP
+# define STRCMP			strncmp
+# define __GI_STRCMP		__GI_strncmp
+# define __STRCMP_IA32		__strncmp_ia32
+# define __STRCMP_SSSE3		__strncmp_ssse3
+# define __STRCMP_SSE4_2	__strncmp_sse4_2
+#elif defined USE_AS_STRCASECMP_L
+# define STRCMP			__strcasecmp_l
+# define __GI_STRCMP		__GI_strcasecmp_l
+# define __STRCMP_IA32		__strcasecmp_l_ia32
+# define __STRCMP_SSSE3		__strcasecmp_l_ssse3
+# define __STRCMP_SSE4_2	__strcasecmp_l_sse4_2
+#elif defined USE_AS_STRNCASECMP_L
+# define STRCMP			__strncasecmp_l
+# define __GI_STRCMP		__GI_strncasecmp_l
+# define __STRCMP_IA32		__strncasecmp_l_ia32
+# define __STRCMP_SSSE3		__strncasecmp_l_ssse3
+# define __STRCMP_SSE4_2	__strncasecmp_l_sse4_2
+#else
+# define STRCMP			strcmp
+# define __GI_STRCMP		__GI_strcmp
+# define __STRCMP_IA32		__strcmp_ia32
+# define __STRCMP_SSSE3		__strcmp_ssse3
+# define __STRCMP_SSE4_2	__strcmp_sse4_2
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in the static library, since
+   strncmp is needed before the IFUNC initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
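+/* The IFUNC resolver below returns in EAX the address of the
+   implementation to use, chosen from the CPU features (SSSE3,
+   SSE4.2, Slow_SSE4_2) at relocation time.  */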
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__STRCMP_IA32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2)
+2:	ret
+END(STRCMP)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __STRCMP_IA32, @function; \
+	.p2align 4; \
+	.globl __STRCMP_IA32; \
+	.hidden __STRCMP_IA32; \
+	__STRCMP_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which the PLT
+   (and therefore IFUNC) needs.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32
+# endif
+#endif
+
+#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \
+    && !defined USE_AS_STRNCASECMP_L
+# include "../strcmp.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
new file mode 100644
index 0000000000..ed627a5f62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -0,0 +1,2250 @@
+/* strcpy with SSE2 and unaligned load
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)                  \
+	cfi_adjust_cfa_offset (4);     \
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)                   \
+	cfi_adjust_cfa_offset (-4);    \
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+#  define STRCPY  __strcpy_sse2
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN  STR2+4
+
+# ifdef USE_AS_STRNCPY
+#  define PARMS  16
+#  define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+#  define RETURN  POP(%edi); POP(%esi); POP(%ebx); ret;          \
+	CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is
+   a jump table with relative offsets.  INDEX is a register containing
+   the index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
+	/* We first load PC into ECX.  */                       \
+	SETUP_PIC_REG(cx);                                      \
+	/* Get the address of the jump table.  */               \
+	addl	$(TABLE - .), %ecx;                             \
+	/* Get the entry and convert the relative offset to the \
+	   absolute address.  */                                \
+	addl	(%ecx,INDEX,SCALE), %ecx;                       \
+	/* We loaded the jump table entry and adjusted ECX.  Go.  */  \
+	jmp	*%ecx
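+
+/* For example, BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) picks
+   the EDX-th 4-byte entry of L(ExitTable) (stored relative to the
+   table in the SHARED case) and jumps to the computed address.  */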
+# else
+#  define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into
+   the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edi
+	mov	STR2(%esp), %esi
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+
+	mov	%esi, %ecx
+# ifndef USE_AS_STPCPY
+	mov	%edi, %eax      /* save result */
+# endif
+	and	$15, %ecx
+	jz	L(SourceStringAlignmentZero)
+
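+	/* Align ESI down to 16 bytes and scan the containing block; the
+	   mask is shifted right by the misalignment in CL so that bits
+	   for bytes before the real string start are discarded.  */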
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%esi), %xmm1
+	add	%ecx, %ebx
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%edi)
+
+	sub	%ecx, %edi
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(Unalign16Both):
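+	/* Source and destination disagree in alignment: aligned 16-byte
+	   loads from the source, unaligned stores to the destination,
+	   testing each block for a NUL byte as we go.  */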
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movdqu	%xmm3, (%edi, %ecx)
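+	/* Advance the source to the next 64-byte boundary and move the
+	   destination pointer and the remaining count in EBX by the same
+	   delta.  */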
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %edi
+	lea	128(%ebx, %edx), %ebx
+
+L(Unaligned64Loop):
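+	/* Scan 64 bytes per iteration: pminub folds the four 16-byte
+	   blocks together, so a NUL anywhere produces a zero byte in the
+	   folded vector, which pcmpeqb against the zero in %xmm0 detects.  */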
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+L(Unaligned64Loop_start):
+	add	$64, %edi
+	add	$64, %esi
+	movdqu	%xmm4, -64(%edi)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%edi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+	movdqu	%xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+# endif
+	movdqu	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx
+	lea	49(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+	pxor	%xmm0, %xmm0
+	movdqa	(%esi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	16(%esi), %xmm0
+	movdqu	%xmm1, (%edi)
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	jmp	L(Unalign16Both)
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %edi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	sub	%ecx, %ebx
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	movdqu	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+# endif
+	movdqu	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+# endif
+	movdqu	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %edi
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(Exit0):
+# ifdef USE_AS_STPCPY
+	mov	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	%dh, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	(%edi), %eax
+# endif
+	sub	$1, %ebx
+	lea	1(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	sub	$2, %ebx
+	lea	2(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%edi)
+	movb	%dh, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	sub	$3, %ebx
+	lea	3(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	sub	$4, %ebx
+	lea	4(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%esi), %ecx
+	movb	%dh, 4(%edi)
+	movl	%ecx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	sub	$5, %ebx
+	lea	5(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	sub	$6, %ebx
+	lea	6(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	sub	$7, %ebx
+	lea	7(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	sub	$8, %ebx
+	lea	8(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%esi), %xmm0
+	movb	%dh, 8(%edi)
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	sub	$9, %ebx
+	lea	9(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	sub	$10, %ebx
+	lea	10(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	sub	$11, %ebx
+	lea	11(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	sub	$12, %ebx
+	lea	12(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	sub	$13, %ebx
+	lea	13(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	sub	$14, %ebx
+	lea	14(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	sub	$15, %ebx
+	lea	15(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	sub	$16, %ebx
+	lea	16(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+	movb	%dh, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	sub	$17, %ebx
+	lea	17(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	sub	$18, %ebx
+	lea	18(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	sub	$19, %ebx
+	lea	19(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	sub	$20, %ebx
+	lea	20(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dh, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	sub	$21, %ebx
+	lea	21(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	sub	$22, %ebx
+	lea	22(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	sub	$23, %ebx
+	lea	23(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	sub	$24, %ebx
+	lea	24(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%dh, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	sub	$25, %ebx
+	lea	25(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	sub	$26, %ebx
+	lea	26(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	sub	$27, %ebx
+	lea	27(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	sub	$28, %ebx
+	lea	28(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	sub	$29, %ebx
+	lea	29(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	sub	$30, %ebx
+	lea	30(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	sub	$31, %ebx
+	lea	31(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	sub	$32, %ebx
+	lea	32(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(StrncpyExit1):
+	movb	(%esi), %dl
+	movb	%dl, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit3):
+	movw	(%esi), %cx
+	movb	2(%esi), %dl
+	movw	%cx, (%edi)
+	movb	%dl, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit5):
+	movl	(%esi), %ecx
+	movb	4(%esi), %dl
+	movl	%ecx, (%edi)
+	movb	%dl, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit9):
+	movlpd	(%esi), %xmm0
+	movb	8(%esi), %dl
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%esi), %xmm0
+	movb	16(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movb	20(%esi), %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movb	24(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movb	32(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+	movb	%cl, 32(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movl	%edx, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%edi)
+	movb	%dl, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%edi)
+	movw	%dx, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movlpd	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 5(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 6(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+	movdqu	%xmm2, (%edi, %ecx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):
+	bsf	%edx, %edx
+	add	$15, %ebx
+	add	%ecx, %edi
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+
+	.p2align 4
+L(StrncpyFillTailWithZero):
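+	/* strncpy semantics: pad the remaining EBX bytes of the
+	   destination with zeros, aligning EDI first so the bulk of the
+	   stores can use movdqa.  */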
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit)
+
+	movdqu	%xmm0, (%edi)
+	add	$16, %edi
+
+	mov	%edi, %esi
+	and	$0xf, %esi
+	sub	%esi, %edi
+	add	%esi, %ebx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	movdqa	%xmm0, 32(%edi)
+	movdqa	%xmm0, 48(%edi)
+	add	$64, %edi
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	add	$32, %edi
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+	add	$16, %ebx
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%edi)
+# ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(ExitZero):
+	movl	%edi, %eax
+	RETURN
+
+END (STRCPY)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):
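+	/* Indexed by the bsf of the zero-byte mask: entry k dispatches to
+	   L(Exit(k+1)), which copies k+1 bytes including the NUL.  */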
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+
+L(ExitStrncpyTable):
+	.int	JMPTBL(L(Exit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+	.p2align 4
+L(FillTable):
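+	/* L(FillN) stores N zero bytes (EDX and XMM0 are already zero);
+	   odd sizes use a wider overlapping store, e.g. L(Fill3) writes
+	   4 bytes starting at offset -1.  */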
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+# else
+#  define PARMS  4
+#  define ENTRANCE
+#  define RETURN  POP (%edi); ret; CFI_PUSH (%edi)
+#  define RETURN1  ret
+
+	.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+
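+	/* A NUL within the first 16 bytes is handled by small dedicated
+	   tail copies before the SSE loops are set up.  */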
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	PUSH	(%ebx)
+
+	mov	%edx, %edi
+	lea	16(%ecx), %ebx
+	and	$-16, %ebx
+	pxor	%xmm0, %xmm0
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)
+	pcmpeqb	(%ebx), %xmm0
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%ecx, %eax
+	lea	16(%ecx), %ecx
+	and	$-16, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+	xor	%ebx, %ebx
+
+	.p2align 4
+	movdqa	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movdqu	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm4
+	movdqu	%xmm3, (%edx, %ebx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm1
+	movdqu	%xmm4, (%edx, %ebx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm2
+	movdqu	%xmm1, (%edx, %ebx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%edx, %ebx)
+	mov	%ecx, %eax
+	lea	16(%ecx, %ebx), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+
+L(Aligned64Loop):
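+	/* Same pminub/pcmpeqb trick as the unaligned loop: fold four
+	   aligned 16-byte blocks and test them for a NUL in one shot.  */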
+	movaps	(%ecx), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	add	$64, %ecx
+	pminub	%xmm7, %xmm3
+	add	$64, %edx
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+L(Aligned64Loop_start):
+	movdqu	%xmm4, -64(%edx)
+	movaps	(%ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edx)
+	movaps	16(%ecx), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%ecx), %xmm3
+	movdqu	%xmm6, -32(%edx)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edx)
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$64, %edx
+	add	$64, %ecx
+	test	%eax, %eax
+	jz	L(Aligned64Loop_start)
+L(Aligned64Leave):
+	sub	$0xa0, %ebx
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%ebx), %ebx
+
+/*-----------------End of main part---------------------------*/
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ebx, %edx
+	add	%ebx, %ecx
+
+	POP	(%ebx)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	/* Exit 8 */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	/* Exit 16 */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edx, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail8):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail9):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail10):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail11):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail12):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail16):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+END (STRCPY)
+# endif
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..effd85da94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3901 @@
+/* strcpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#  define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#  define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#  define POP(REG)	popl REG; CFI_POP (REG)
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_ssse3
+#  endif
+
+#  ifdef USE_AS_STRNCPY
+#   define PARMS  8
+#   define ENTRANCE PUSH (%ebx)
+#   define RETURN  POP (%ebx); ret; CFI_PUSH (%ebx);
+#   define RETURN1  POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+#  else
+#   define PARMS  4
+#   define ENTRANCE
+#   define RETURN  ret
+#   define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
+#  endif
+
+#  ifdef USE_AS_STPCPY
+#   define SAVE_RESULT(n)  lea	n(%edx), %eax
+#   define SAVE_RESULT_TAIL(n)  lea	n(%edx), %eax
+#  else
+#   define SAVE_RESULT(n)  movl	%edi, %eax
+#   define SAVE_RESULT_TAIL(n)  movl	%edx, %eax
+#  endif
+
+#  define STR1  PARMS
+#  define STR2  STR1+4
+#  define LEN  STR2+4
+
+/* In this code the following instructions are used for copying:
+	movb	- 1 byte
+	movw	- 2 bytes
+	movl	- 4 bytes
+	movlpd	- 8 bytes
+	movaps	- 16 bytes - requires 16-byte alignment
+		  of source and destination addresses.
+*/
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+#  ifdef USE_AS_STRNCPY
+	movl	LEN(%esp), %ebx
+	cmp	$8, %ebx
+	jbe	L(StrncpyExit8Bytes)
+#  endif
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	jb	L(StrncpyExit15Bytes)
+#  endif
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	je	L(ExitTail16)
+#  endif
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+# endif
+	PUSH	(%esi)
+# ifdef USE_AS_STRNCPY
+	mov	%ecx, %esi
+	sub	$16, %ebx
+	and	$0xf, %esi
+
+/* %ebx = LEN - 16 + (src & 15): the count remaining, measured from
+   the first 16-byte aligned chunk of the source.  */
+
+	add	%esi, %ebx
+# endif
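+/* The first 16 bytes of the source were already checked for a NUL by
+   the entry code above, so copy them with two unaligned 8-byte moves
+   while pcmpeqb scans the first 16-byte aligned chunk of the source
+   for the terminator.  %esi becomes the offset from %ecx to that
+   aligned chunk.  */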
+	lea	16(%ecx), %esi
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	movlpd	(%ecx), %xmm1
+	movlpd	%xmm1, (%edx)
+
+	pcmpeqb	(%esi), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+# ifdef USE_AS_STRNCPY
+	add	%eax, %esi
+	lea	-1(%esi), %esi
+	and	$1<<31, %esi
+	test	%esi, %esi
+	jnz	L(ContinueCopy)
+	lea	16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+/* The source is now 16-byte aligned as well (source and destination
+   had the same offset); use the mutually aligned copy loop.  */
+
+	jz	L(Align16Both)
+
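+/* After aligning the destination, %eax holds the source address modulo
+   16; dispatch to the Shl<N> variant that matches this offset.  */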
+	cmp	$8, %eax
+	jae	L(ShlHigh8)
+	cmp	$1, %eax
+	je	L(Shl1)
+	cmp	$2, %eax
+	je	L(Shl2)
+	cmp	$3, %eax
+	je	L(Shl3)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$5, %eax
+	je	L(Shl5)
+	cmp	$6, %eax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):
+	je	L(Shl8)
+	cmp	$9, %eax
+	je	L(Shl9)
+	cmp	$10, %eax
+	je	L(Shl10)
+	cmp	$11, %eax
+	je	L(Shl11)
+	cmp	$12, %eax
+	je	L(Shl12)
+	cmp	$13, %eax
+	je	L(Shl13)
+	cmp	$14, %eax
+	je	L(Shl14)
+	jmp	L(Shl15)
+
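+/* Source and destination now share 16-byte alignment.  Copy unrolled,
+   one 16-byte chunk per step: each chunk is tested for a NUL byte
+   (pcmpeqb against the zeroed %xmm0) before it is stored, and the
+   chunk holding the terminator is finished by the byte-exact exit
+   code at L(CopyFrom1To16Bytes).  */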
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	lea	112(%ebx, %eax), %ebx
+# endif
+	mov	$-0x40, %esi
+
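+/* Main loop, 64 bytes per iteration.  The four loaded chunks are
+   folded with pminub, so a NUL byte anywhere in the 64 bytes survives
+   as a zero in the minimum and one pcmpeqb/pmovmskb test covers the
+   whole iteration.  The stores are issued only after the test, so no
+   byte past the terminator is written by this loop.  */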
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqb	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeaveCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+	lea	48(%ebx), %ebx
+# endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+	jmp	L(CopyFrom1To16Bytes)
+
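+/* L(Shl1) through L(Shl15) handle a source that is N bytes away from
+   16-byte alignment while the destination is already aligned: two
+   adjacent aligned loads are merged with palignr $N into the 16 bytes
+   to store.  The fifteen variants differ only in the shift count and
+   in the load offsets.  */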
+	.p2align 4
+L(Shl1):
+	movaps	-1(%ecx), %xmm1
+	movaps	15(%ecx), %xmm2
+L(Shl1Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	31(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-15(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-1(%ecx), %xmm1
+
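+/* Unrolled Shl1 main loop: four 16-byte chunks per iteration, checked
+   for a NUL with the same pminub folding as L(Aligned64Loop) and
+   realigned with palignr $1 before the aligned stores.  */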
+L(Shl1LoopStart):
+	movaps	15(%ecx), %xmm2
+	movaps	31(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$1, %xmm3, %xmm4
+	jnz	L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave1)
+# endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl1LoopStart)
+
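+/* A NUL was found in the chunk at 15(%ecx): copy the 15 bytes that
+   precede it with two overlapping 8-byte moves and let
+   L(CopyFrom1To16Bytes) finish the chunk using the mask in %eax.  */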
+L(Shl1LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	mov	$15, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):
+	movaps	-2(%ecx), %xmm1
+	movaps	14(%ecx), %xmm2
+L(Shl2Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	30(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-14(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+	movaps	14(%ecx), %xmm2
+	movaps	30(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$2, %xmm3, %xmm4
+	jnz	L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave2)
+# endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):
+	movaps	-3(%ecx), %xmm1
+	movaps	13(%ecx), %xmm2
+L(Shl3Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	29(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-13(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+	movaps	13(%ecx), %xmm2
+	movaps	29(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$3, %xmm3, %xmm4
+	jnz	L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave3)
+# endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave4)
+# endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):
+	movaps	-5(%ecx), %xmm1
+	movaps	11(%ecx), %xmm2
+L(Shl5Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	27(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-11(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+	movaps	11(%ecx), %xmm2
+	movaps	27(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$5, %xmm3, %xmm4
+	jnz	L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave5)
+# endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):
+	movaps	-6(%ecx), %xmm1
+	movaps	10(%ecx), %xmm2
+L(Shl6Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	26(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-10(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+	movaps	10(%ecx), %xmm2
+	movaps	26(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$6, %xmm3, %xmm4
+	jnz	L(Shl6Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave6)
+# endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):
+	movaps	-7(%ecx), %xmm1
+	movaps	9(%ecx), %xmm2
+L(Shl7Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	25(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-9(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+	movaps	9(%ecx), %xmm2
+	movaps	25(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$7, %xmm3, %xmm4
+	jnz	L(Shl7Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave7)
+# endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave8)
+# endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):
+	movaps	-9(%ecx), %xmm1
+	movaps	7(%ecx), %xmm2
+L(Shl9Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	23(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-7(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+	movaps	7(%ecx), %xmm2
+	movaps	23(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$9, %xmm3, %xmm4
+	jnz	L(Shl9Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave9)
+# endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$7, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):
+	movaps	-10(%ecx), %xmm1
+	movaps	6(%ecx), %xmm2
+L(Shl10Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	22(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-6(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+	movaps	6(%ecx), %xmm2
+	movaps	22(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$10, %xmm3, %xmm4
+	jnz	L(Shl10Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave10)
+# endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$6, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl11):
+	movaps	-11(%ecx), %xmm1
+	movaps	5(%ecx), %xmm2
+L(Shl11Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	21(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-5(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-11(%ecx), %xmm1
+
+L(Shl11LoopStart):
+	movaps	5(%ecx), %xmm2
+	movaps	21(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$11, %xmm3, %xmm4
+	jnz	L(Shl11Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave11)
+# endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+	movlpd	-3(%ecx), %xmm0
+	movlpd	%xmm0, -3(%edx)
+	mov	$5, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave12)
+# endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl13):
+	movaps	-13(%ecx), %xmm1
+	movaps	3(%ecx), %xmm2
+L(Shl13Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	19(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-3(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-13(%ecx), %xmm1
+
+L(Shl13LoopStart):
+	movaps	3(%ecx), %xmm2
+	movaps	19(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$13, %xmm3, %xmm4
+	jnz	L(Shl13Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave13)
+# endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl13LoopStart)
+
+L(Shl13LoopExit):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl14):
+	movaps	-14(%ecx), %xmm1
+	movaps	2(%ecx), %xmm2
+L(Shl14Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	18(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-2(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-14(%ecx), %xmm1
+
+L(Shl14LoopStart):
+	movaps	2(%ecx), %xmm2
+	movaps	18(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$14, %xmm3, %xmm4
+	jnz	L(Shl14Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave14)
+# endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl14LoopStart)
+
+L(Shl14LoopExit):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl15):
+	movaps	-15(%ecx), %xmm1
+	movaps	1(%ecx), %xmm2
+L(Shl15Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	17(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-1(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-15(%ecx), %xmm1
+
+L(Shl15LoopStart):
+	movaps	1(%ecx), %xmm2
+	movaps	17(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$15, %xmm3, %xmm4
+	jnz	L(Shl15Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave15)
+# endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl15LoopStart)
+
+L(Shl15LoopExit):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+# ifdef USE_AS_STRCAT
+	jmp	L(CopyFrom1To16Bytes)
+# endif
+
+
+# ifndef USE_AS_STRCAT
+
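+/* Common finish: %esi is the offset of the 16-byte chunk holding the
+   terminator and %eax the pcmpeqb bitmask for that chunk.  Advance
+   both pointers by %esi, then use the low (%al) and high (%ah) mask
+   bits to reach an exit stub that copies exactly the remaining
+   1 to 16 bytes.  */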
+	.p2align 4
+L(CopyFrom1To16Bytes):
+#  ifdef USE_AS_STRNCPY
+	add	$16, %ebx
+#  endif
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+	mov	%al, %ah
+	and	$15, %ah
+	jz	L(ExitHigh4)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(3)
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
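+/* Branchless stpncpy return fixup, reached only when the count is
+   exhausted exactly here: if the last byte stored (at %eax) is the
+   NUL terminator, cmpb sets the carry flag and the sbb leaves %eax
+   pointing at it; otherwise %eax is advanced to just past the copied
+   bytes.  The same fixup follows each exit stub below.  */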
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4):
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(7)
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8):
+	mov	%ah, %al
+	and	$15, %al
+	jz	L(ExitHigh12)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(11)
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12):
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT	(15)
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+#   ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
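+/* Case 2: the byte count runs out inside the same 16-byte chunk in
+   which a NUL was found, so whichever comes first limits the copy.
+   Case 3 below is the count running out with no NUL seen: exactly
+   %ebx more bytes are copied with no terminator check.  */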
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	add	%esi, %edx
+
+	POP	(%esi)
+
+	test	%al, %al
+	jz	L(ExitHighCase2)
+
+	cmp	$8, %ebx
+	ja	L(CopyFrom1To16BytesLess8)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	jmp	L(Exit8)
+
+	.p2align 4
+L(ExitHighCase2):
+	cmp	$8, %ebx
+	jbe	L(CopyFrom1To16BytesLess8Case3)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	jmp	L(Exit16)
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+	cmp	$4, %ebx
+	ja	L(ExitHigh4Case3)
+
+	cmp	$1, %ebx
+	je	L(Exit1)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(4)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4Case3):
+	cmp	$5, %ebx
+	je	L(Exit5)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(8)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8Case3):
+	cmp	$12, %ebx
+	ja	L(ExitHigh12Case3)
+
+	cmp	$9, %ebx
+	je	L(Exit9)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(12)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12Case3):
+	cmp	$13, %ebx
+	je	L(Exit13)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	SAVE_RESULT	(16)
+	RETURN1
+
+#  endif
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT	(0)
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT	(1)
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT	(2)
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT	(4)
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT	(5)
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT	(6)
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT	(8)
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT	(9)
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT	(10)
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT	(12)
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT	(13)
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT	(14)
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+CFI_POP	(%edi)
+
+#  ifdef USE_AS_STRNCPY
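+/* Fill stubs: store the 0 to 16 zero bytes that remain after the
+   padding loop.  The caller zeroes %edx and %xmm0, so plain stores of
+   those registers produce the padding.  */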
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movw	%dx, (%ecx)
+	movb	%dl, 2(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%ecx)
+	movb	%dl, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%ecx)
+	movw	%dx, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movl	%edx, (%ecx)
+	movl	%edx, 3(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%ecx)
+	movb	%dl, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%ecx)
+	movw	%dx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 7(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 5(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 6(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 7(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(StrncpyFillExit1):
+	lea	16(%ebx), %ebx
+L(FillFrom1To16Bytes):
+	test	%ebx, %ebx
+	jz	L(Fill0)
+	cmp	$16, %ebx
+	je	L(Fill16)
+	cmp	$8, %ebx
+	je	L(Fill8)
+	jg	L(FillMore8)
+	cmp	$4, %ebx
+	je	L(Fill4)
+	jg	L(FillMore4)
+	cmp	$2, %ebx
+	jl	L(Fill1)
+	je	L(Fill2)
+	jg	L(Fill3)
+L(FillMore8):	/* but less than 16 */
+	cmp	$12, %ebx
+	je	L(Fill12)
+	jl	L(FillLess12)
+	cmp	$14, %ebx
+	jl	L(Fill13)
+	je	L(Fill14)
+	jg	L(Fill15)
+L(FillMore4):	/* but less than 8 */
+	cmp	$6, %ebx
+	jl	L(Fill5)
+	je	L(Fill6)
+	jg	L(Fill7)
+L(FillLess12):	/* but more than 8 */
+	cmp	$10, %ebx
+	jl	L(Fill9)
+	je	L(Fill10)
+	jmp	L(Fill11)
+
+	CFI_PUSH(%edi)
+
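+/* strncpy pads the destination with zeros up to the full count.
+   After the first 16 bytes the pointer is aligned, and the loop below
+   clears 64 bytes per iteration with movdqa; the Fill stubs above
+   take care of the final partial block.  */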
+	.p2align 4
+L(StrncpyFillTailWithZero1):
+	POP	(%edi)
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit1)
+
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+
+	lea	16(%ecx), %ecx
+
+	mov	%ecx, %edx
+	and	$0xf, %edx
+	sub	%edx, %ecx
+	add	%edx, %ebx
+	xor	%edx, %edx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	movdqa	%xmm0, 32(%ecx)
+	movdqa	%xmm0, 48(%ecx)
+	lea	64(%ecx), %ecx
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	lea	32(%ecx), %ecx
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+#  endif
+
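+/* ExitTail stubs: the terminator was found within the first 16 bytes
+   of the source, before any alignment work, so copy exactly 1 to 16
+   bytes straight from the entry registers.  */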
+	.p2align 4
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT_TAIL (0)
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT_TAIL (1)
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT_TAIL (2)
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT_TAIL (4)
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT_TAIL (5)
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT_TAIL (6)
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (7)
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT_TAIL (8)
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT_TAIL (9)
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT_TAIL (10)
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT_TAIL (12)
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT_TAIL (13)
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT_TAIL (14)
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (15)
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+#  endif
+	.p2align 4
+L(StrncpyLeaveCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase2)
+
+/*--------------------------------------------------*/
+	.p2align 4
+L(StrncpyExit1Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	mov	$15, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit2Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit3Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit4Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit5Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit6Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit7Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit8Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit9Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$7, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit10Case2OrCase3):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$6, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit11Case2OrCase3):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$5, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit12Case2OrCase3):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit13Case2OrCase3):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit14Case2OrCase3):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit15Case2OrCase3):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit1)
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit1):
+	lea	15(%edx, %esi), %edx
+	lea	15(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit2)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit2):
+	lea	14(%edx, %esi), %edx
+	lea	14(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit3)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit3):
+	lea	13(%edx, %esi), %edx
+	lea	13(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit4)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit4):
+	lea	12(%edx, %esi), %edx
+	lea	12(%ecx, %esi), %ecx
+	movlpd	-12(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit5)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit5):
+	lea	11(%edx, %esi), %edx
+	lea	11(%ecx, %esi), %ecx
+	movlpd	-11(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -11(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit6)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit6):
+	lea	10(%edx, %esi), %edx
+	lea	10(%ecx, %esi), %ecx
+
+	movlpd	-10(%ecx), %xmm0
+	movw	-2(%ecx), %ax
+	movlpd	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit7)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit7):
+	lea	9(%edx, %esi), %edx
+	lea	9(%ecx, %esi), %ecx
+
+	movlpd	-9(%ecx), %xmm0
+	movb	-1(%ecx), %ah
+	movlpd	%xmm0, -9(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit8)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit8):
+	lea	8(%edx, %esi), %edx
+	lea	8(%ecx, %esi), %ecx
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit9)
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit9):
+	lea	7(%edx, %esi), %edx
+	lea	7(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit10)
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit10):
+	lea	6(%edx, %esi), %edx
+	lea	6(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit11)
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit11):
+	lea	5(%edx, %esi), %edx
+	lea	5(%ecx, %esi), %ecx
+	movl	-5(%ecx), %esi
+	movb	-1(%ecx), %ah
+	movl	%esi, -5(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit12)
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit12):
+	lea	4(%edx, %esi), %edx
+	lea	4(%ecx, %esi), %ecx
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit13)
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit13):
+	lea	3(%edx, %esi), %edx
+	lea	3(%ecx, %esi), %ecx
+
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit14)
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit14):
+	lea	2(%edx, %esi), %edx
+	lea	2(%ecx, %esi), %ecx
+	movw	-2(%ecx), %ax
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit15)
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit15):
+	lea	1(%edx, %esi), %edx
+	lea	1(%ecx, %esi), %ecx
+	movb	-1(%ecx), %ah
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+# endif
+
+# ifndef USE_AS_STRCAT
+#  ifdef USE_AS_STRNCPY
+	CFI_POP (%esi)
+	CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail0):
+	movl	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15Bytes):
+	cmp	$12, %ebx
+	jbe	L(StrncpyExit12Bytes)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmp	$13, %ebx
+	je	L(ExitTail13)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmp	$14, %ebx
+	je	L(ExitTail14)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+#   ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   else
+	movl	%edx, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12Bytes):
+	cmp	$9, %ebx
+	je	L(ExitTail9)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmp	$10, %ebx
+	je	L(ExitTail10)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmp	$11, %ebx
+	je	L(ExitTail11)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8Bytes):
+	cmp	$4, %ebx
+	jbe	L(StrncpyExit4Bytes)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+
+	cmp	$5, %ebx
+	je	L(ExitTail5)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmp	$6, %ebx
+	je	L(ExitTail6)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmp	$7, %ebx
+	je	L(ExitTail7)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+#   ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   else
+	movl	%edx, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4Bytes):
+	test	%ebx, %ebx
+	jz	L(ExitTail0)
+	cmp	$1, %ebx
+	je	L(ExitTail1)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmp	$2, %ebx
+	je	L(ExitTail2)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmp	$3, %ebx
+	je	L(ExitTail3)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+	RETURN
+#  endif
+
+END (STRCPY)
+# endif
+#endif
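Note on the tail logic above: strncpy must NUL-pad the destination out to n
bytes when the source is shorter, which is what the L(StrncpyFillTailWithZero*)
paths do with progressively wider SSE stores.  The recurring pair
"cmpb $1, (%eax); sbb $-1, %eax" in the STPCPY exits is a branchless
return-value adjustment: the compare sets the carry flag exactly when the byte
at the result pointer is NUL, and sbb then computes %eax + 1 - CF, so the
pointer is advanced only past a non-NUL byte.  A minimal C sketch of the
copy-then-pad semantics (the function name is illustrative, not glibc's):

    #include <string.h>

    static char *
    strncpy_sketch (char *dst, const char *src, size_t n)
    {
      const char *nul = memchr (src, '\0', n);
      /* Copy up to and including the NUL, or all n bytes if none.  */
      size_t len = nul ? (size_t) (nul - src) + 1 : n;
      memcpy (dst, src, len);          /* the L(ExitTail*) copies */
      memset (dst + len, 0, n - len);  /* the StrncpyFillTailWithZero part */
      return dst;
    }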
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
new file mode 100644
index 0000000000..ffbc03c6d5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
@@ -0,0 +1,116 @@
+/* Multiple versions of strcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__stpncpy_ssse3
+#  define STRCPY_SSE2		__stpncpy_sse2
+#  define STRCPY_IA32		__stpncpy_ia32
+#  define __GI_STRCPY		__GI_stpncpy
+#  define __GI___STRCPY		__GI___stpncpy
+# else
+#  define STRCPY_SSSE3	__stpcpy_ssse3
+#  define STRCPY_SSE2		__stpcpy_sse2
+#  define STRCPY_IA32		__stpcpy_ia32
+#  define __GI_STRCPY		__GI_stpcpy
+#  define __GI___STRCPY		__GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__strncpy_ssse3
+#  define STRCPY_SSE2		__strncpy_sse2
+#  define STRCPY_IA32		__strncpy_ia32
+#  define __GI_STRCPY		__GI_strncpy
+# else
+#  define STRCPY_SSSE3	__strcpy_ssse3
+#  define STRCPY_SSE2		__strcpy_sse2
+#  define STRCPY_IA32		__strcpy_ia32
+#  define __GI_STRCPY		__GI_strcpy
+# endif
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncpy in the static library, since we
+   need strncpy before the initialization has happened.  */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCPY)
+	.type	STRCPY, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCPY_IA32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCPY_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCPY_SSSE3)
+2:	ret
+END(STRCPY)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCPY_IA32, @function; \
+	.align 16; \
+	.globl STRCPY_IA32; \
+	.hidden STRCPY_IA32; \
+	STRCPY_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32
+
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  include "../../stpncpy.S"
+# else
+#  include "../../i586/stpcpy.S"
+# endif
+#else
+# ifndef USE_AS_STRNCPY
+#  include "../../i586/strcpy.S"
+# endif
+#endif
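The ENTRY body above is an IFUNC resolver: ld.so calls it once at relocation
time and stores the function address it returns (in %eax) into strcpy's GOT
slot, so every later call dispatches directly with no runtime check.  A rough
C equivalent using GCC's ifunc attribute is sketched below; the names and the
trivial backends are assumptions for illustration, and since there is no
compiler builtin for the Fast_Unaligned_Load tunable, only the SSE2/SSSE3
checks are mirrored:

    typedef char *(*strcpy_fn) (char *dst, const char *src);

    /* For the sketch, all three backends share one byte-loop body.  */
    static char *
    copy_common (char *d, const char *s)
    {
      char *r = d;
      while ((*d++ = *s++) != '\0')
        ;
      return r;
    }
    static char *copy_ia32 (char *d, const char *s) { return copy_common (d, s); }
    static char *copy_sse2 (char *d, const char *s) { return copy_common (d, s); }
    static char *copy_ssse3 (char *d, const char *s) { return copy_common (d, s); }

    /* Runs once, during relocation; its return value becomes the
       call target of every subsequent my_strcpy call.  */
    static strcpy_fn
    my_strcpy_resolver (void)
    {
      __builtin_cpu_init ();
      if (!__builtin_cpu_supports ("sse2"))
        return copy_ia32;
      if (__builtin_cpu_supports ("ssse3"))
        return copy_ssse3;
      return copy_sse2;
    }

    char *my_strcpy (char *dst, const char *src)
         __attribute__ ((ifunc ("my_strcpy_resolver")));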
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..6d61e190a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
@@ -0,0 +1,2 @@
+#define __strcspn_sse2 __strcspn_ia32
+#include <sysdeps/x86_64/multiarch/strcspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
new file mode 100644
index 0000000000..21e5093924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
@@ -0,0 +1,75 @@
+/* Multiple versions of strcspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42	__strpbrk_sse42
+#define STRCSPN_IA32	__strpbrk_ia32
+#define __GI_STRCSPN	__GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN		strcspn
+#define STRCSPN_SSE42	__strcspn_sse42
+#define STRCSPN_IA32	__strcspn_ia32
+#define __GI_STRCSPN	__GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in the static library, since we
+   need strpbrk before the initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
+	.text
+ENTRY(STRCSPN)
+	.type	STRCSPN, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCSPN_IA32)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCSPN_SSE42)
+2:	ret
+END(STRCSPN)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCSPN_IA32, @function; \
+	.globl STRCSPN_IA32; \
+	.p2align 4; \
+	STRCSPN_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library since
+   they will be called without setting up EBX, which is needed for the
+   PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
+#endif
+
+#ifdef USE_AS_STRPBRK
+#include "../../strpbrk.S"
+#else
+#include "../../strcspn.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
new file mode 100644
index 0000000000..d3ea864bab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -0,0 +1,125 @@
+/* strlen with SSE2 and BSF
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined SHARED && IS_IN (libc)
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+#define PARMS		4 + 8	/* Preserve ESI and EDI.  */
+#define	STR		PARMS
+#define ENTRANCE	PUSH (%esi); PUSH (%edi); cfi_remember_state
+#define RETURN		POP (%edi); POP (%esi); ret; \
+			cfi_restore_state; cfi_remember_state
+
+	.text
+ENTRY (__strlen_sse2_bsf)
+	ENTRANCE
+	mov	STR(%esp), %edi
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%edi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	mov	%edi, %eax
+	and	$-16, %eax
+	jmp	L(align16_start)
+L(next):
+
+	mov	%edi, %eax
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	mov	$-1, %esi
+	sub	%eax, %ecx
+	shl	%cl, %esi
+	pmovmskb %xmm0, %edx
+	and	%esi, %edx
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop)
+L(exit):
+	sub	%edi, %eax
+L(exit_less16):
+	bsf	%edx, %edx
+	add	%edx, %eax
+	RETURN
+L(exit16):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$16, %eax
+	RETURN
+L(exit32):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$32, %eax
+	RETURN
+L(exit48):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$48, %eax
+	POP (%edi)
+	POP (%esi)
+	ret
+
+END (__strlen_sse2_bsf)
+
+#endif
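__strlen_sse2_bsf scans 16 bytes per step: pcmpeqb against an all-zero
register marks the NUL bytes, pmovmskb collapses that to a 16-bit mask, and
bsf on the first nonzero mask yields the byte index of the terminator.  A
hedged intrinsics sketch of the same scan follows; it relies on the usual
argument that a 16-byte load from a 16-byte-aligned address never crosses a
page boundary, so reading a few bytes before the string start is harmless:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    static size_t
    strlen_sse2_sketch (const char *s)
    {
      const __m128i zero = _mm_setzero_si128 ();
      /* Align down so every load stays within one aligned 16-byte block.  */
      const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 15);
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
      /* Discard match bits for bytes that precede the real string start.  */
      mask &= ~0u << ((uintptr_t) s & 15);
      while (mask == 0)
        {
          p += 16;
          v = _mm_load_si128 ((const __m128i *) p);
          mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
        }
      /* __builtin_ctz plays the role of bsf here.  */
      return (size_t) ((p + __builtin_ctz (mask)) - s);
    }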
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
new file mode 100644
index 0000000000..36fc1469d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -0,0 +1,695 @@
+/* strlen with SSE2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* For strlen only the SHARED version is optimized; for strcat, strncat and strnlen both the STATIC and SHARED versions are optimized.  */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+
+#  include <sysdep.h>
+#  define PARMS	4
+#  define STR	PARMS
+#  define RETURN	ret
+
+#  ifdef USE_AS_STRNLEN
+#   define LEN	PARMS + 8
+#   define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#   define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#   define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
+#   define POP(REG)	popl	REG;	CFI_POP (REG)
+#   undef RETURN
+#   define RETURN	POP (%edi); CFI_PUSH(%edi); ret
+#  endif
+
+#  ifndef STRLEN
+#   define STRLEN	__strlen_sse2
+#  endif
+
+	atom_text_section
+ENTRY (STRLEN)
+	mov	STR(%esp), %edx
+#  ifdef USE_AS_STRNLEN
+	PUSH	(%edi)
+	movl	LEN(%esp), %edi
+	sub	$4, %edi
+	jbe	L(len_less4_prolog)
+#  endif
+# endif
+	xor	%eax, %eax
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less8_prolog)
+# endif
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less12_prolog)
+# endif
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less16_prolog)
+# endif
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+
+	pxor	%xmm0, %xmm0
+	lea	16(%edx), %eax
+	mov	%eax, %ecx
+	and	$-16, %eax
+
+# ifdef USE_AS_STRNLEN
+	and	$15, %edx
+	add	%edx, %edi
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	mov	%eax, %edx
+	and	$63, %edx
+	add	%edx, %edi
+# endif
+
+	and	$-0x40, %eax
+
+	.p2align 4
+L(aligned_64_loop):
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqb	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax
+	jz	L(aligned_64_loop)
+
+	pcmpeqb	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+L(exit):
+	sub	%ecx, %eax
+	test	%dl, %dl
+	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(exit_tail1)
+	test	$0x04, %dl
+	jnz	L(exit_tail2)
+	add	$3, %eax
+	RETURN
+
+	.p2align 4
+L(exit_8):
+	test	$0x10, %dl
+	jnz	L(exit_tail4)
+	test	$0x20, %dl
+	jnz	L(exit_tail5)
+	test	$0x40, %dl
+	jnz	L(exit_tail6)
+	add	$7, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_high_8)
+	test	$0x01, %dh
+	jnz	L(exit_tail8)
+	test	$0x02, %dh
+	jnz	L(exit_tail9)
+	test	$0x04, %dh
+	jnz	L(exit_tail10)
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high_8):
+	test	$0x10, %dh
+	jnz	L(exit_tail12)
+	test	$0x20, %dh
+	jnz	L(exit_tail13)
+	test	$0x40, %dh
+	jnz	L(exit_tail14)
+	add	$15, %eax
+L(exit_tail0):
+	RETURN
+
+# ifdef USE_AS_STRNLEN
+
+	.p2align 4
+L(len_less64):
+	pxor	%xmm0, %xmm0
+	add	$64, %edi
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	movl	LEN(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit):
+	sub	%ecx, %eax
+
+	test	%dl, %dl
+	jz	L(strnlen_exit_high)
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(strnlen_exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(strnlen_exit_tail1)
+	test	$0x04, %dl
+	jnz	L(strnlen_exit_tail2)
+	sub	$4, %edi
+	jb	L(return_start_len)
+	lea	3(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_8):
+	test	$0x10, %dl
+	jnz	L(strnlen_exit_tail4)
+	test	$0x20, %dl
+	jnz	L(strnlen_exit_tail5)
+	test	$0x40, %dl
+	jnz	L(strnlen_exit_tail6)
+	sub	$8, %edi
+	jb	L(return_start_len)
+	lea	7(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(strnlen_exit_high_8)
+	test	$0x01, %dh
+	jnz	L(strnlen_exit_tail8)
+	test	$0x02, %dh
+	jnz	L(strnlen_exit_tail9)
+	test	$0x04, %dh
+	jnz	L(strnlen_exit_tail10)
+	sub	$12, %edi
+	jb	L(return_start_len)
+	lea	11(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high_8):
+	test	$0x10, %dh
+	jnz	L(strnlen_exit_tail12)
+	test	$0x20, %dh
+	jnz	L(strnlen_exit_tail13)
+	test	$0x40, %dh
+	jnz	L(strnlen_exit_tail14)
+	sub	$16, %edi
+	jb	L(return_start_len)
+	lea	15(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail1):
+	sub	$2, %edi
+	jb	L(return_start_len)
+	lea	1(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail2):
+	sub	$3, %edi
+	jb	L(return_start_len)
+	lea	2(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail4):
+	sub	$5, %edi
+	jb	L(return_start_len)
+	lea	4(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail5):
+	sub	$6, %edi
+	jb	L(return_start_len)
+	lea	5(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail6):
+	sub	$7, %edi
+	jb	L(return_start_len)
+	lea	6(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail8):
+	sub	$9, %edi
+	jb	L(return_start_len)
+	lea	8(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail9):
+	sub	$10, %edi
+	jb	L(return_start_len)
+	lea	9(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail10):
+	sub	$11, %edi
+	jb	L(return_start_len)
+	lea	10(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail12):
+	sub	$13, %edi
+	jb	L(return_start_len)
+	lea	12(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail13):
+	sub	$14, %edi
+	jb	L(return_start_len)
+	lea	13(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail14):
+	sub	$15, %edi
+	jb	L(return_start_len)
+	lea	14(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(return_start_len):
+	movl	LEN(%esp), %eax
+	RETURN
+
+/* The following short-length paths are used only by the prolog checks above.  */
+
+	.p2align 4
+L(len_less4_prolog):
+	xor	%eax, %eax
+
+	add	$4, %edi
+	jz	L(exit_tail0)
+
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$1, %edi
+	je	L(exit_tail1)
+
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmp	$2, %edi
+	je	L(exit_tail2)
+
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmp	$3, %edi
+	je	L(exit_tail3)
+
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+	mov	$4, %eax
+	RETURN
+
+	.p2align 4
+L(len_less8_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmp	$1, %edi
+	je	L(exit_tail5)
+
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmp	$2, %edi
+	je	L(exit_tail6)
+
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmp	$3, %edi
+	je	L(exit_tail7)
+
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+	mov	$8, %eax
+	RETURN
+
+
+	.p2align 4
+L(len_less12_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmp	$1, %edi
+	je	L(exit_tail9)
+
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmp	$2, %edi
+	je	L(exit_tail10)
+
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmp	$3, %edi
+	je	L(exit_tail11)
+
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+	mov	$12, %eax
+	RETURN
+
+	.p2align 4
+L(len_less16_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmp	$1, %edi
+	je	L(exit_tail13)
+
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmp	$2, %edi
+	je	L(exit_tail14)
+
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmp	$3, %edi
+	je	L(exit_tail15)
+
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+	mov	$16, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(exit_tail1):
+	add	$1, %eax
+	RETURN
+
+L(exit_tail2):
+	add	$2, %eax
+	RETURN
+
+L(exit_tail3):
+	add	$3, %eax
+	RETURN
+
+L(exit_tail4):
+	add	$4, %eax
+	RETURN
+
+L(exit_tail5):
+	add	$5, %eax
+	RETURN
+
+L(exit_tail6):
+	add	$6, %eax
+	RETURN
+
+L(exit_tail7):
+	add	$7, %eax
+	RETURN
+
+L(exit_tail8):
+	add	$8, %eax
+	RETURN
+
+L(exit_tail9):
+	add	$9, %eax
+	RETURN
+
+L(exit_tail10):
+	add	$10, %eax
+	RETURN
+
+L(exit_tail11):
+	add	$11, %eax
+	RETURN
+
+L(exit_tail12):
+	add	$12, %eax
+	RETURN
+
+L(exit_tail13):
+	add	$13, %eax
+	RETURN
+
+L(exit_tail14):
+	add	$14, %eax
+	RETURN
+
+L(exit_tail15):
+	add	$15, %eax
+# ifndef USE_AS_STRCAT
+	RETURN
+END (STRLEN)
+# endif
+#endif
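The main loop of __strlen_sse2 (L(aligned_64_loop)) checks 64 bytes per
iteration with a pminub reduction: the unsigned byte-wise minimum of four
16-byte chunks has a zero lane exactly when any of the 64 bytes is zero, so a
single pcmpeqb/pmovmskb test covers the whole block, and only on a hit does
the code re-test the four chunks individually.  A small intrinsics sketch of
that folding step (the function name is illustrative):

    #include <emmintrin.h>

    /* Nonzero iff any of the 64 bytes at P (which must be at least
       16-byte aligned) is NUL; mirrors the pminub trick in the loop.  */
    static int
    block64_has_nul (const char *p)
    {
      __m128i a = _mm_load_si128 ((const __m128i *) p);
      __m128i b = _mm_load_si128 ((const __m128i *) (p + 16));
      __m128i c = _mm_load_si128 ((const __m128i *) (p + 32));
      __m128i d = _mm_load_si128 ((const __m128i *) (p + 48));
      __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b), _mm_min_epu8 (c, d));
      return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ())) != 0;
    }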
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
new file mode 100644
index 0000000000..77cf6bcdb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
@@ -0,0 +1,60 @@
+/* Multiple versions of strlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need strlen before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(strlen)
+	.type	strlen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strlen_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strlen_sse2)
+2:	ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strlen_ia32, @function; \
+	.globl __strlen_ia32; \
+	.p2align 4; \
+	__strlen_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library since
+   they will be called without setting up EBX, which is needed for the
+   PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strlen; __GI_strlen = __strlen_ia32
+#endif
+
+#include "../../i586/strlen.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
new file mode 100644
index 0000000000..76581eb62b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
@@ -0,0 +1,8 @@
+#include <string.h>
+
+extern __typeof (strncasecmp) __strncasecmp_nonascii;
+
+#define __strncasecmp __strncasecmp_nonascii
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
new file mode 100644
index 0000000000..a56e63a566
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strncasecmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+	.text
+ENTRY(__strncasecmp)
+	.type	__strncasecmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f
+	LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2)
+2:	ret
+END(__strncasecmp)
+
+weak_alias (__strncasecmp, strncasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
new file mode 100644
index 0000000000..7e601af271
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii;
+
+#define __strncasecmp_l __strncasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL    1
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32)
+
+/* The needs of strncasecmp_l in libc are minimal, so there is no need
+   to go through the IFUNC.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
new file mode 100644
index 0000000000..557210832e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
new file mode 100644
index 0000000000..d438a1ae35
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
new file mode 100644
index 0000000000..8a74ee8574
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strncasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strncasecmp_l
+#define USE_AS_STRNCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strncasecmp_l, strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
new file mode 100644
index 0000000000..132a000545
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_ia32
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
+#endif
+
+#include "string/strncat.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
new file mode 100644
index 0000000000..f1045b72b8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
@@ -0,0 +1,4 @@
+#define STRCAT  __strncat_sse2
+#define USE_AS_STRNCAT
+
+#include "strcat-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000000..625b90a978
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
@@ -0,0 +1,4 @@
+#define STRCAT  __strncat_ssse3
+#define USE_AS_STRNCAT
+
+#include "strcat-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
new file mode 100644
index 0000000000..5c1bf41453
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncat
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..cc059da494
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+# define STRNCMP __strncmp_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32);
+#endif
+
+#include "string/strncmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
new file mode 100644
index 0000000000..cf14dfaf6c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_sse4_2
+# include "strcmp-sse4.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..536c8685f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_ssse3
+# include "strcmp-ssse3.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
new file mode 100644
index 0000000000..150d4786d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncmp
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STRNCMP
+#define STRCMP	strncmp
+#include "strcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..201e3f98b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_ia32
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32);
+#endif
+
+#include "string/strncpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
new file mode 100644
index 0000000000..bdd99239a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000000..bf82ee447d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
new file mode 100644
index 0000000000..9c257efc6e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "strcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
new file mode 100644
index 0000000000..351e939a93
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
@@ -0,0 +1,10 @@
+#define STRNLEN  __strnlen_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name)  \
+    __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \
+    strong_alias (__strnlen_ia32, __strnlen_ia32_1); \
+    __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1);
+#endif
+
+#include "string/strnlen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
new file mode 100644
index 0000000000..56b6ae2a5c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2
+#include "strlen-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
new file mode 100644
index 0000000000..d241522c70
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of strnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__strnlen)
+	.type	__strnlen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strnlen_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strnlen_sse2)
+2:	ret
+END(__strnlen)
+
+weak_alias (__strnlen, strnlen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..5db62053b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
@@ -0,0 +1,2 @@
+#define __strpbrk_sse2 __strpbrk_ia32
+#include <sysdeps/x86_64/multiarch/strpbrk-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
new file mode 100644
index 0000000000..7201d6376f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strpbrk
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
new file mode 100644
index 0000000000..39a7c8825b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
@@ -0,0 +1,282 @@
+/* strrchr with SSE2 with bsf and bsr
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	.text
+ENTRY (__strrchr_sse2_bsf)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	PUSH	(%edi)
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	ja	L(crosscache)
+
+/* Unaligned string.  */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+	add	$16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_return_value1):
+	bsf	%edx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_null)
+	bsr	%eax, %eax
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+	CFI_PUSH	(%edi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%edx, %edx
+	jnz	L(unaligned_return_value1)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	lea	16(%edi), %esi
+	and	$-16, %edi
+	add	$16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	add	$16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_return_value):
+	add	%ecx, %edi
+	bsf	%edx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_null)
+	bsr	%eax, %eax
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+	CFI_PUSH	(%edi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(unaligned_return_value)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	add	$16, %edi
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	bsr	%ebx, %eax
+	add	%esi, %eax
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	sub	$16, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(return_value_1)
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(return_value_1):
+	bsf	%ecx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	bsr	%eax, %eax
+	add	%edi, %eax
+	sub	$16, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+/* Return NULL.  */
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+	POP	(%edi)
+	xor	%eax, %eax
+	ret
+
+END (__strrchr_sse2_bsf)
+#endif
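The unaligned_return_value blocks above are the heart of this variant: bsf locates the first NUL bit, a shifted mask discards any matches beyond it, and bsr then picks the last surviving match. The same computation in C (a sketch using GCC builtins in place of bsf/bsr):

#include <stddef.h>

/* match_mask and nul_mask are pmovmskb results for one 16-byte block.
   Returns the last match at or before the first NUL, or NULL.  */
static const char *
last_match_before_nul (const char *block, unsigned int match_mask,
                       unsigned int nul_mask)
{
  if (nul_mask != 0)
    /* mov $2,%edx; shl %cl,%edx; sub $1,%edx in the assembly.  */
    match_mask &= (2u << __builtin_ctz (nul_mask)) - 1;
  if (match_mask == 0)
    return NULL;
  return block + (31 - __builtin_clz (match_mask));   /* bsr */
}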
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
new file mode 100644
index 0000000000..20934288be
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
@@ -0,0 +1,708 @@
+/* strrchr with SSE2, without bsf and bsr
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  8
+# define ENTRANCE PUSH(%edi);
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	atom_text_section
+ENTRY (__strrchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	ja	L(crosscache)
+
+/* Unaligned string.  */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	mov	%ebx, %eax
+	mov	%esi, %edi
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(find_zero_8)
+	test	$0x01, %cl
+	jnz	L(FindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(FindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(FindZeroExit3)
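+	/* Note: GAS binds << tighter than binary -, so $1 << 4 - 1 below
+	   is (1 << 4) - 1 = 0x0f, the mask for the low four bytes; the
+	   same idiom recurs throughout this file.  */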
+	and	$1 << 4 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_8):
+	test	$0x10, %cl
+	jnz	L(FindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(FindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(FindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(FindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(FindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(FindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(FindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(FindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(FindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit1):
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	.p2align 4
+L(match_exit):
+	test	%ah, %ah
+	jnz	L(match_exit_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(match_exit_8)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_8):
+	test	$0x80, %al
+	jnz	L(Exit8)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(match_exit_high_8)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_high_8):
+	test	$0x80, %ah
+	jnz	L(Exit16)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	-15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	-14(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	-13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	-11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	-10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	lea	-9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	-7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	-6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	-5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	-3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	-2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	lea	-1(%edi), %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(prolog_find_zero_8)
+	test	$0x01, %cl
+	jnz	L(PrologFindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(PrologFindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(PrologFindZeroExit3)
+	and	$1 << 4 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_8):
+	test	$0x10, %cl
+	jnz	L(PrologFindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(PrologFindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(PrologFindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(prolog_find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(PrologFindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(PrologFindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(PrologFindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(PrologFindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(PrologFindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(PrologFindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit1):
+	and	$1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+END (__strrchr_sse2)
+#endif
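This no-bsf variant exists for CPUs flagged Slow_BSF (e.g. Atom), where bit-scan instructions are expensive: the loop instead remembers the most recent block that held a match in %ebx/%esi, and the exit paths decode bit positions with compare-and-branch ladders. The loop-level strategy in C (a sketch; the helpers are hypothetical stand-ins for the pcmpeqb/pmovmskb sequences and the branch ladders):

#include <stddef.h>

extern unsigned int match_mask16 (const char *blk, char c);
extern unsigned int nul_mask16 (const char *blk);
extern char *decode_last (const char *blk, unsigned int mask);

static char *
strrchr_loop_sketch (const char *p, char c)
{
  unsigned int saved = 0;     /* %ebx: mask of last block with a match */
  const char *base = NULL;    /* %esi: that block's address */
  unsigned int m, z;

  while ((z = nul_mask16 (p)) == 0)
    {
      if ((m = match_mask16 (p, c)) != 0)
        {
          saved = m;
          base = p;
        }
      p += 16;
    }
  /* Final block: drop matches past the first NUL, else fall back to
     the remembered block.  */
  m = match_mask16 (p, c) & (((z & -z) << 1) - 1);
  if (m != 0)
    return decode_last (p, m);
  return saved != 0 ? decode_last (base, saved) : NULL;
}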
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
new file mode 100644
index 0000000000..d9281eaeae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strrchr)
+	.type	strrchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strrchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2)
+2:	ret
+END(strrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strrchr_ia32, @function; \
+	.globl __strrchr_ia32; \
+	.p2align 4; \
+	__strrchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strrchr; __GI_strrchr = __strrchr_ia32
+#endif
+
+#include "../../strrchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
new file mode 100644
index 0000000000..bea09dea71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
@@ -0,0 +1,2 @@
+#define __strspn_sse2 __strspn_ia32
+#include <sysdeps/x86_64/multiarch/strspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
new file mode 100644
index 0000000000..1269062381
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
@@ -0,0 +1,56 @@
+/* Multiple versions of strspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(strspn)
+	.type	strspn, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strspn_ia32)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strspn_sse42)
+2:	ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strspn_ia32, @function; \
+	.globl __strspn_ia32; \
+	.p2align 4; \
+__strspn_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strspn; __GI_strspn = __strspn_ia32
+#endif
+
+#include "../../strspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
new file mode 100644
index 0000000000..593cfec273
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/test-multiarch.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
new file mode 100644
index 0000000000..7760b966e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
new file mode 100644
index 0000000000..7c72c70d67
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
new file mode 100644
index 0000000000..38d41d04de
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
@@ -0,0 +1,22 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# undef libc_hidden_weak
+# define libc_hidden_weak(name)
+
+# undef weak_alias
+# define weak_alias(name,alias)
+
+# ifdef SHARED
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+   __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \
+   strong_alias (__wcschr_ia32, __wcschr_ia32_1); \
+   __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1);
+# endif
+#endif
+
+extern __typeof (wcschr) __wcschr_ia32;
+
+#define WCSCHR  __wcschr_ia32
+#include <wcsmbs/wcschr.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000000..9ff6c3b8d6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -0,0 +1,219 @@
+/* wcschr with SSE2, without using bsf instructions
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define STR1	PARMS
+# define STR2	STR1+4
+
+	atom_text_section
+ENTRY (__wcschr_sse2)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %eax
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+
+	and	$63, %eax
+	cmp	$48, %eax
+	ja	L(cross_cache)
+
+	movdqu	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	and	$-16, %ecx
+	jmp	L(loop)
+
+	.p2align 4
+L(cross_cache):
+	PUSH	(%edi)
+	mov	%ecx, %edi
+	mov	%eax, %ecx
+	and	$-16, %edi
+	and	$15, %ecx
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+
+	add	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jz	L(match_case1)
+	test	%al, %al
+	jz	L(match_high_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(unaligned_no_match):
+	mov	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	pxor	%xmm2, %xmm2
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	add	$16, %ecx
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %edx
+	test	%eax, %eax
+	jz	L(return_null)
+	test	%edx, %edx
+	jz	L(match_case1)
+
+	.p2align 4
+L(match_case2):
+	test	%al, %al
+	jz	L(match_high_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_4):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(match_high_case2):
+	test	%dl, %dl
+	jnz	L(return_null)
+	test	$15, %ah
+	jnz	L(match_case2_12)
+	test	$15, %dh
+	jnz	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_12):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case1):
+	test	%al, %al
+	jz	L(match_high_case1)
+
+	test	$0x01, %al
+	jnz	L(exit0)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_high_case1):
+	test	$0x01, %ah
+	jnz	L(exit3)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit0):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit3):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+END (__wcschr_sse2)
+#endif
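A reading aid for the exit paths above: wchar_t is four bytes here, so pcmpeqd plus pmovmskb sets four mask bits per matching element, which is why the code tests nibble-sized groups (test $15, %al and friends) rather than single bits. The mapping, sketched in C:

/* mask16 is the pmovmskb result for one 16-byte (four-wchar_t) block;
   returns the index of the first matching element, or -1.  */
static int
first_wchar_slot (unsigned int mask16)
{
  if (mask16 & 0x000f) return 0;   /* test $15, %al */
  if (mask16 & 0x00f0) return 1;
  if (mask16 & 0x0f00) return 2;   /* test $15, %ah */
  if (mask16 & 0xf000) return 3;
  return -1;
}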
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000000..d3c65a6436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcschr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__wcschr)
+	.type	__wcschr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcschr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcschr_sse2)
+2:	ret
+END(__wcschr)
+weak_alias (__wcschr, wcschr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
new file mode 100644
index 0000000000..e3337d77e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
@@ -0,0 +1,14 @@
+#include <wchar.h>
+
+#define WCSCMP __wcscmp_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+  __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32);
+#endif
+#undef weak_alias
+#define weak_alias(name, alias)
+
+extern __typeof (wcscmp) __wcscmp_ia32;
+
+#include "wcsmbs/wcscmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
new file mode 100644
index 0000000000..a464b58204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -0,0 +1,1018 @@
+/* wcscmp with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define ENTRANCE PUSH(%esi); PUSH(%edi)
+# define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+/* Note: wcscmp uses signed comparison, not unsigned as in strcmp.  */
+
+	.text
+ENTRY (__wcscmp_sse2)
+/* This implementation uses SSE2 to compare up to 16 bytes at a time.  */
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+
+	mov	(%eax), %ecx
+	cmp	%ecx, (%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	4(%eax), %ecx
+	cmp	%ecx, 4(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	8(%eax), %ecx
+	cmp	%ecx, 8(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	12(%eax), %ecx
+	cmp	%ecx, 12(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	ENTRANCE
+	add	$16, %eax
+	add	$16, %edx
+
+	mov	%eax, %esi
+	mov	%edx, %edi
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	mov	%al, %ch
+	mov	%dl, %cl
+	and	$63, %eax		/* esi alignment in cache line */
+	and	$63, %edx		/* edi alignment in cache line */
+	and	$15, %cl
+	jz	L(continue_00)
+	cmp	$16, %edx
+	jb	L(continue_0)
+	cmp	$32, %edx
+	jb	L(continue_16)
+	cmp	$48, %edx
+	jb	L(continue_32)
+
+L(continue_48):
+	and	$15, %ch
+	jz	L(continue_48_00)
+	cmp	$16, %eax
+	jb	L(continue_0_48)
+	cmp	$32, %eax
+	jb	L(continue_16_48)
+	cmp	$48, %eax
+	jb	L(continue_32_48)
+
+	.p2align 4
+L(continue_48_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_48)
+
+L(continue_0):
+	and	$15, %ch
+	jz	L(continue_0_00)
+	cmp	$16, %eax
+	jb	L(continue_0_0)
+	cmp	$32, %eax
+	jb	L(continue_0_16)
+	cmp	$48, %eax
+	jb	L(continue_0_32)
+
+	.p2align 4
+L(continue_0_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_0_48)
+
+	.p2align 4
+L(continue_00):
+	and	$15, %ch
+	jz	L(continue_00_00)
+	cmp	$16, %eax
+	jb	L(continue_00_0)
+	cmp	$32, %eax
+	jb	L(continue_00_16)
+	cmp	$48, %eax
+	jb	L(continue_00_32)
+
+	.p2align 4
+L(continue_00_48):
+	pcmpeqd	(%edi), %xmm0
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)
+
+	cmp	(%esi), %eax
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_32):
+	and	$15, %ch
+	jz	L(continue_32_00)
+	cmp	$16, %eax
+	jb	L(continue_0_32)
+	cmp	$32, %eax
+	jb	L(continue_16_32)
+	cmp	$48, %eax
+	jb	L(continue_32_32)
+
+	.p2align 4
+L(continue_32_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_16):
+	and	$15, %ch
+	jz	L(continue_16_00)
+	cmp	$16, %eax
+	jb	L(continue_0_16)
+	cmp	$32, %eax
+	jb	L(continue_16_16)
+	cmp	$48, %eax
+	jb	L(continue_16_32)
+
+	.p2align 4
+L(continue_16_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_00_00):
+	movdqa	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqa	16(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqa	32(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm5		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm5		/* packed sub of comparison results*/
+	pmovmskb %xmm5, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqa	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_00)
+
+	.p2align 4
+L(continue_00_32):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_16):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_0):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_48_00):
+	pcmpeqd	(%esi), %xmm0
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)
+
+	cmp	(%esi), %eax
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_16_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_0_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_16_16):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_0):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_16):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_0_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_16_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(less4_double_words1):
+	cmp	(%esi), %eax
+	jne	L(nequal)
+	test	%eax, %eax
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(less4_double_words):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_16):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_16)
+	and	$15, %dl
+	jz	L(second_double_word_16)
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_16):
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_16):
+	and	$15, %dh
+	jz	L(fourth_double_word_16)
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_16):
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_32):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_32)
+	and	$15, %dl
+	jz	L(second_double_word_32)
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_32):
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_32):
+	and	$15, %dh
+	jz	L(fourth_double_word_32)
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_32):
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_48):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_48)
+	and	$15, %dl
+	jz	L(second_double_word_48)
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_48):
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_48):
+	and	$15, %dh
+	jz	L(fourth_double_word_48)
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_48):
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(return)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(return):
+	RETURN
+
+	.p2align 4
+L(equal):
+	xorl	%eax, %eax
+	RETURN
+
+	CFI_POP (%edi)
+	CFI_POP (%esi)
+
+	.p2align 4
+L(neq):
+	mov	$1, %eax
+	jg	L(neq_bigger)
+	neg	%eax
+
+L(neq_bigger):
+	ret
+
+	.p2align 4
+L(eq):
+	xorl	%eax, %eax
+	ret
+
+END (__wcscmp_sse2)
+#endif
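The L(nequal) and L(neq) epilogues above use jg after the cmp, i.e. a signed comparison, which is what the note at the top of the file is about: wide characters compare as signed 32-bit values, unlike strcmp's unsigned bytes. The return convention in C (a sketch; wchar_t is a signed int on this target):

#include <wchar.h>

/* a and b are the first differing elements of s1 and s2.  */
static int
wcscmp_result (wchar_t a, wchar_t b)
{
  if (a == b)
    return 0;
  return a > b ? 1 : -1;   /* signed compare, as jg implies */
}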
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
new file mode 100644
index 0000000000..7118bdd4db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
@@ -0,0 +1,39 @@
+/* Multiple versions of wcscmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, wcscmp is needed before the ifunc
+   initialization has happened.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__wcscmp)
+	.type	__wcscmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscmp_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcscmp_sse2)
+2:	ret
+END(__wcscmp)
+weak_alias (__wcscmp, wcscmp)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..fb3000392b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcscpy  __wcscpy_ia32
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..6280ba92ab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,600 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
+# define STR1	PARMS
+# define STR2	STR1+4
+# define LEN	STR2+4
+
+	atom_text_section
+ENTRY (__wcscpy_ssse3)
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+
+	cmp	$0, (%ecx)
+	jz	L(ExitTail4)
+	cmp	$0, 4(%ecx)
+	jz	L(ExitTail8)
+	cmp	$0, 8(%ecx)
+	jz	L(ExitTail12)
+	cmp	$0, 12(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+	PUSH	(%esi)
+	lea	16(%ecx), %esi
+
+	and	$-16, %esi
+
+	pxor	%xmm0, %xmm0
+	pcmpeqd	(%esi), %xmm0
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+	jz	L(Align16Both)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$8, %eax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqd	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqd	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %esi
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	POP	(%esi)
+	add	$12, %edx
+	add	$12, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	POP	(%esi)
+	add	$8, %edx
+	add	$8, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit12)
+L(Exit16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN
+
+CFI_POP	(%edi)
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+END (__wcscpy_ssse3)
+#endif
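The Shl4/Shl8/Shl12 blocks cover the three remaining alignments: once
the destination has been rounded up to a 16-byte boundary, the source
offset modulo 16 can only be 0, 4, 8 or 12, since wide characters are
four bytes.  Each block performs only aligned loads and uses palignr to
cut a byte-shifted 16-byte window out of two adjacent vectors.  A
sketch of the Shl4 case with SSSE3 intrinsics (hypothetical helper,
assuming src is 4 bytes past a 16-byte boundary):

    #include <tmmintrin.h>

    /* Two aligned loads plus palignr reproduce one unaligned load:
       the result holds bytes src[0..15].  */
    static __m128i
    load_shifted_by_4 (const unsigned char *src)
    {
      __m128i lo = _mm_load_si128 ((const __m128i *) (src - 4));
      __m128i hi = _mm_load_si128 ((const __m128i *) (src + 12));
      return _mm_alignr_epi8 (hi, lo, 4);  /* lo[4..15] then hi[0..3] */
    }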
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000000..cfc97dd87c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcscpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscpy_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcscpy_ssse3)
+2:	ret
+END(wcscpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
new file mode 100644
index 0000000000..a335dc0f7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WCSLEN  __wcslen_ia32
+#endif
+
+extern __typeof (wcslen) __wcslen_ia32;
+
+#include "wcsmbs/wcslen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
new file mode 100644
index 0000000000..bd3fc4c79b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -0,0 +1,193 @@
+/* wcslen with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define STR	4
+
+	.text
+ENTRY (__wcslen_sse2)
+	mov	STR(%esp), %edx
+
+	cmp	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$0, 4(%edx)
+	jz	L(exit_tail1)
+	cmp	$0, 8(%edx)
+	jz	L(exit_tail2)
+	cmp	$0, 12(%edx)
+	jz	L(exit_tail3)
+	cmp	$0, 16(%edx)
+	jz	L(exit_tail4)
+	cmp	$0, 20(%edx)
+	jz	L(exit_tail5)
+	cmp	$0, 24(%edx)
+	jz	L(exit_tail6)
+	cmp	$0, 28(%edx)
+	jz	L(exit_tail7)
+
+	pxor	%xmm0, %xmm0
+
+	lea	32(%edx), %eax
+	lea	16(%edx), %ecx
+	and	$-16, %eax
+
+	pcmpeqd	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	and	$-0x40, %eax
+
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqd	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax
+	jz	L(aligned_64_loop)
+
+	pcmpeqd	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	jmp	L(aligned_64_loop)
+
+	.p2align 4
+L(exit):
+	sub	%ecx, %eax
+	shr	$2, %eax
+	test	%dl, %dl
+	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_1)
+	ret
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_3)
+	add	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_1):
+	add	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	add	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail0):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_tail1):
+	mov	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_tail2):
+	mov	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_tail3):
+	mov	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail4):
+	mov	$4, %eax
+	ret
+
+	.p2align 4
+L(exit_tail5):
+	mov	$5, %eax
+	ret
+
+	.p2align 4
+L(exit_tail6):
+	mov	$6, %eax
+	ret
+
+	.p2align 4
+L(exit_tail7):
+	mov	$7, %eax
+	ret
+
+END (__wcslen_sse2)
+#endif
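The 64-byte loop tests four vectors at once with a byte-wise minimum:
if any 16-byte block contains a zero dword (the L'\0' terminator), the
corresponding lanes of the pminub result are all zero, so a single
pcmpeqd on the minimum suffices.  Zero bytes from different blocks can
line up in one lane and produce a false positive, which is why the exit
path rechecks each block individually before computing the length.  A
sketch of one iteration with SSE2 intrinsics (hypothetical helper; p is
assumed 16-byte aligned):

    #include <emmintrin.h>

    /* Nonzero iff one of the four 16-byte blocks at p may contain a
       zero wchar_t; false positives are possible and must be resolved
       by rechecking the blocks one at a time.  */
    static int
    block64_may_have_nul (const unsigned char *p)
    {
      __m128i a = _mm_load_si128 ((const __m128i *) p);
      __m128i b = _mm_load_si128 ((const __m128i *) (p + 16));
      __m128i c = _mm_load_si128 ((const __m128i *) (p + 32));
      __m128i d = _mm_load_si128 ((const __m128i *) (p + 48));
      __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b), _mm_min_epu8 (c, d));
      __m128i eq = _mm_cmpeq_epi32 (m, _mm_setzero_si128 ());
      return _mm_movemask_epi8 (eq);   /* pmovmskb of the pcmpeqd mask */
    }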
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
new file mode 100644
index 0000000000..6ef9b6e7b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of wcslen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__wcslen)
+	.type	__wcslen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcslen_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcslen_sse2)
+2:	ret
+END(__wcslen)
+
+weak_alias (__wcslen, wcslen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000000..8d8a335b5b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcsrchr  __wcsrchr_ia32
+#endif
+
+#include "wcsmbs/wcsrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000000..1a9b60e55e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,354 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	8
+# define ENTRANCE	PUSH (%edi);
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
+# define STR1	PARMS
+# define STR2	STR1+4
+
+	atom_text_section
+ENTRY (__wcsrchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %edi
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+
+/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+
+/* Save current match */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+
+	mov	%eax, %edx
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm4, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm5, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%edx, %edx
+	jz	L(return_null_1)
+	mov	%edx, %eax
+	mov	%esi, %edi
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+/* save match info */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_fourth_wchar):
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_or_fourth_wchar):
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(prolog_find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_null)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(prolog_find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+END (__wcsrchr_sse2)
+#endif
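The vector code scans forward, remembering the match mask and position
of the most recent block that contained the search character, and only
resolves the answer once the terminator shows up; the bit tests on
%al/%ah and %cl/%ch then select the right dword within the final
16-byte block.  The scalar contract it implements is simply this
(sketch, hypothetical name):

    #include <stddef.h>
    #include <wchar.h>

    /* Scalar reference: keep the last match seen up to and including
       the terminator (so searching for L'\0' finds the terminator).  */
    static wchar_t *
    wcsrchr_ref (const wchar_t *s, wchar_t c)
    {
      const wchar_t *last = NULL;
      for (;; ++s)
        {
          if (*s == c)
            last = s;
          if (*s == L'\0')
            return (wchar_t *) last;
        }
    }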
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
new file mode 100644
index 0000000000..cf67333995
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -0,0 +1,35 @@
+/* Multiple versions of wcsrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(wcsrchr)
+	.type	wcsrchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcsrchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcsrchr_sse2)
+2:	ret
+END(wcsrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..75ab4b94c1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WMEMCMP  __wmemcmp_ia32
+#endif
+
+extern __typeof (wmemcmp) __wmemcmp_ia32;
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..1a857c7e21
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..1b9a54a413
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,40 @@
+/* Multiple versions of wmemcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+	.text
+ENTRY(wmemcmp)
+	.type	wmemcmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2)
+2:	ret
+END(wmemcmp)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/nptl/tls.h b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h
new file mode 100644
index 0000000000..5b527af9d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h
@@ -0,0 +1,35 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _TLS_H
+
+/* Additional definitions for <tls.h> on i686 and up.  */
+
+
+/* Macros to load from and store into segment registers.  We can use
+   the 32-bit instructions.  */
+#define TLS_GET_GS() \
+  ({ int __seg; __asm ("movl %%gs, %0" : "=q" (__seg)); __seg; })
+#define TLS_SET_GS(val) \
+  __asm ("movl %0, %%gs" :: "q" (val))
+
+
+/* Get the full set of definitions.  */
+#include_next <tls.h>
+
+#endif	/* tls.h */
diff --git a/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S
new file mode 100644
index 0000000000..ce9c94d41a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S
@@ -0,0 +1,20 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define HAVE_CMOV	1
+#include <sysdeps/i386/pthread_spin_trylock.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h
new file mode 100644
index 0000000000..9b5a1b0d47
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h
@@ -0,0 +1,23 @@
+/* Define macros for stack address aliasing issues for NPTL.  i686 version.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* What matters is avoiding the 64k aliasing problem, which reliably
+   happens if all stacks use sizes which are a multiple of 64k.  Tell
+   the stack allocator to disturb this by allocating one more page if
+   necessary.  */
+#define MULTI_PAGE_ALIASING     65536
diff --git a/REORG.TODO/sysdeps/i386/i686/strcmp.S b/REORG.TODO/sysdeps/i386/i686/strcmp.S
new file mode 100644
index 0000000000..1ae305912e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/strcmp.S
@@ -0,0 +1,52 @@
+/* Highly optimized version for ix86, x>=6.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define STR1	PARMS
+#define STR2	STR1+4
+
+        .text
+ENTRY (strcmp)
+
+	movl	STR1(%esp), %ecx
+	movl	STR2(%esp), %edx
+
+L(oop):	movb	(%ecx), %al
+	cmpb	(%edx), %al
+	jne	L(neq)
+	incl	%ecx
+	incl	%edx
+	testb	%al, %al
+	jnz	L(oop)
+
+	xorl	%eax, %eax
+	/* When the strings are equal, the pointers rest one beyond
+	   the end of the NUL terminators.  */
+	ret
+
+L(neq):	movl	$1, %eax
+	movl	$-1, %ecx
+	cmovbl	%ecx, %eax
+
+	ret
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
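The tail avoids a branch on the result's sign: cmpb leaves the flags of
*s1 - *s2, and cmovbl replaces the preloaded 1 with -1 when the
unsigned "below" condition holds.  The whole routine therefore amounts
to this C sketch (hypothetical name; bytes compared as unsigned, like
the cmovb):

    /* Scalar equivalent of the loop above; returns exactly -1, 0 or 1.  */
    static int
    strcmp_ref (const char *s1, const char *s2)
    {
      unsigned char c1, c2;
      do
        {
          c1 = (unsigned char) *s1++;
          c2 = (unsigned char) *s2++;
        }
      while (c1 == c2 && c1 != '\0');
      return c1 == c2 ? 0 : (c1 < c2 ? -1 : 1);
    }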
diff --git a/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h
new file mode 100644
index 0000000000..51f03fe77b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h
@@ -0,0 +1,44 @@
+/* Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <stdint.h>
+#ifndef __SSE__
+#include_next <tst-stack-align.h>
+#else
+#include <xmmintrin.h>
+
+#define TEST_STACK_ALIGN() \
+  ({									     \
+    __m128 _m;								     \
+    double _d = 12.0;							     \
+    long double _ld = 15.0;						     \
+    int _ret = 0;							     \
+    printf ("__m128:  %p %zu\n", &_m, __alignof (__m128));		     \
+    if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0)		     \
+      _ret = 1;								     \
+									     \
+    printf ("double:  %g %p %zu\n", _d, &_d, __alignof (double));	     \
+    if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0)		     \
+      _ret = 1;								     \
+									     \
+    printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double));    \
+    if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0)	     \
+      _ret = 1;								     \
+    _ret;								     \
+    })
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i786/Implies b/REORG.TODO/sysdeps/i386/i786/Implies
new file mode 100644
index 0000000000..1cd29f63cf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i786/Implies
@@ -0,0 +1,2 @@
+# The PPro and PII cores are mostly the same.
+i386/i686
diff --git a/REORG.TODO/sysdeps/i386/init-arch.h b/REORG.TODO/sysdeps/i386/init-arch.h
new file mode 100644
index 0000000000..72881c5679
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MINIMUM_ISA 486
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h b/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h
new file mode 100644
index 0000000000..1c95db7287
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h
@@ -0,0 +1,25 @@
+/* Private macros for accessing __jmp_buf contents.  i386 version.
+   Copyright (C) 2006-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define JB_BX	0
+#define JB_SI	1
+#define JB_DI	2
+#define JB_BP	3
+#define JB_SP	4
+#define JB_PC	5
+#define JB_SIZE 24
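These indices lay out the i386 __jmp_buf as six 32-bit slots: the
callee-saved %ebx, %esi, %edi and %ebp, then the stack pointer and the
resume address, for JB_SIZE = 6 * 4 = 24 bytes.  A hypothetical
illustration of reading the saved stack pointer back out (glibc's
internal member name is __jmpbuf; the stored value may be PTR_MANGLEd,
as _jmpbuf_sp in jmpbuf-unwind.h shows):

    #include <setjmp.h>

    /* Sketch only: fetch the JB_SP slot of a jmp_buf.  */
    static int
    saved_sp (jmp_buf env)
    {
      return env[0].__jmpbuf[4];   /* JB_SP */
    }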
diff --git a/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h b/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h
new file mode 100644
index 0000000000..0a63a832cc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h
@@ -0,0 +1,47 @@
+/* Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <setjmp.h>
+#include <jmpbuf-offsets.h>
+#include <stdint.h>
+#include <unwind.h>
+#include <sysdep.h>
+
+/* Test if longjmp to JMPBUF would unwind the frame
+   containing a local variable at ADDRESS.  */
+#define _JMPBUF_UNWINDS(jmpbuf, address, demangle) \
+  ((void *) (address) < (void *) demangle ((jmpbuf)[JB_SP]))
+
+#define _JMPBUF_CFA_UNWINDS_ADJ(_jmpbuf, _context, _adj) \
+  _JMPBUF_UNWINDS_ADJ (_jmpbuf, (void *) _Unwind_GetCFA (_context), _adj)
+
+static inline uintptr_t __attribute__ ((unused))
+_jmpbuf_sp (__jmp_buf regs)
+{
+  uintptr_t sp = regs[JB_SP];
+#ifdef PTR_DEMANGLE
+  PTR_DEMANGLE (sp);
+#endif
+  return sp;
+}
+
+#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \
+  ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj))
+
+/* We use the normal longjmp for unwinding.  */
+#define __libc_unwind_longjmp(buf, val) __libc_longjmp (buf, val)
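The _adj bias in _JMPBUF_UNWINDS_ADJ makes the unsigned comparison
robust: subtracting the same adjustment from both the local address and
the saved stack pointer rebases the address space, so frames compare in
the right order even when the raw pointer values straddle the bias
point or wrap around.  Spelled out (a sketch, not glibc code):

    #include <stdint.h>

    /* On a downward-growing stack, ADDRESS lives in a frame that a
       longjmp restoring SP would discard iff it is below SP once both
       are rebased by ADJ.  */
    static int
    unwinds_adj (uintptr_t address, uintptr_t sp, uintptr_t adj)
    {
      return address - adj < sp - adj;
    }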
diff --git a/REORG.TODO/sysdeps/i386/ldbl2mpn.c b/REORG.TODO/sysdeps/i386/ldbl2mpn.c
new file mode 100644
index 0000000000..076be0ae7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ldbl2mpn.c
@@ -0,0 +1,120 @@
+/* Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#include <ieee754.h>
+#include <float.h>
+#include <stdlib.h>
+
+/* Convert a `long double' in IEEE854 extended-precision format to a
+   multi-precision integer representing the significand scaled up by its
+   number of bits (64 for long double) and an integral power of two
+   (MPN frexpl).  */
+
+mp_size_t
+__mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
+			   int *expt, int *is_neg,
+			   long double value)
+{
+  union ieee854_long_double u;
+  u.d = value;
+
+  *is_neg = u.ieee.negative;
+  *expt = (int) u.ieee.exponent - IEEE854_LONG_DOUBLE_BIAS;
+
+#if BITS_PER_MP_LIMB == 32
+  res_ptr[0] = u.ieee.mantissa1; /* Low-order 32 bits of fraction.  */
+  res_ptr[1] = u.ieee.mantissa0; /* High-order 32 bits.  */
+  #define N 2
+#elif BITS_PER_MP_LIMB == 64
+  /* Hopefully the compiler will combine the two bitfield extracts
+     and this composition into just the original quadword extract.  */
+  res_ptr[0] = ((mp_limb_t) u.ieee.mantissa0 << 32) | u.ieee.mantissa1;
+  #define N 1
+#else
+  #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
+#endif
+
+  if (u.ieee.exponent == 0)
+    {
+      /* A biased exponent of zero is a special case.
+	 Either it is a zero or it is a denormal number.  */
+      if (res_ptr[0] == 0 && res_ptr[N - 1] == 0) /* Assumes N<=2.  */
+	/* It's zero.  */
+	*expt = 0;
+      else
+	{
+	  /* It is a denormal number, meaning it has no implicit leading
+	     one bit, and its exponent is in fact the format minimum.  */
+	  int cnt;
+
+	  /* One problem with Intel's 80-bit format is that the explicit
+	     leading one in the normalized representation has to be zero
+	     for denormalized numbers.  If it is one, the number is,
+	     according to Intel's specification, an invalid number.  We make the
+	     representation unique by explicitly clearing this bit.  */
+	  res_ptr[N - 1] &= ~((mp_limb_t) 1 << ((LDBL_MANT_DIG - 1) % BITS_PER_MP_LIMB));
+
+	  if (res_ptr[N - 1] != 0)
+	    {
+	      count_leading_zeros (cnt, res_ptr[N - 1]);
+	      if (cnt != 0)
+		{
+#if N == 2
+		  res_ptr[N - 1] = res_ptr[N - 1] << cnt
+				   | (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt));
+		  res_ptr[0] <<= cnt;
+#else
+		  res_ptr[N - 1] <<= cnt;
+#endif
+		}
+	      *expt = LDBL_MIN_EXP - 1 - cnt;
+	    }
+	  else if (res_ptr[0] != 0)
+	    {
+	      count_leading_zeros (cnt, res_ptr[0]);
+	      res_ptr[N - 1] = res_ptr[0] << cnt;
+	      res_ptr[0] = 0;
+	      *expt = LDBL_MIN_EXP - 1 - BITS_PER_MP_LIMB - cnt;
+	    }
+	  else
+	    {
+	      /* This is the special case of the pseudo denormal number
+		 with only the implicit leading bit set.  The value is
+		 in fact a normal number and so we have to treat this
+		 case differently.  */
+#if N == 2
+	      res_ptr[N - 1] = 0x80000000ul;
+#else
+	      res_ptr[0] = 0x8000000000000000ul;
+#endif
+	      *expt = LDBL_MIN_EXP - 1;
+	    }
+	}
+    }
+  else if (u.ieee.exponent < 0x7fff
+#if N == 2
+	   && res_ptr[0] == 0
+#endif
+	   && res_ptr[N - 1] == 0)
+    /* Pseudo zero.  */
+    *expt = 0;
+
+  return N;
+}
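A hypothetical usage sketch on a 32-bit-limb build (the declaration is
an assumption; inside glibc it comes from the internal gmp headers):
after the call, frac[1]:frac[0] hold the 64-bit significand of the
input, expt its binary exponent and neg its sign, and the return value
is the number of limbs written (2 here).

    #include <gmp.h>

    extern mp_size_t __mpn_extract_long_double (mp_ptr, mp_size_t,
                                                int *, int *, long double);

    static void
    extract_example (void)
    {
      mp_limb_t frac[2];
      int expt, neg;
      mp_size_t n = __mpn_extract_long_double (frac, 2, &expt, &neg, 0.75L);
      (void) n;   /* n == 2 limbs were written */
    }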
diff --git a/REORG.TODO/sysdeps/i386/ldsodefs.h b/REORG.TODO/sysdeps/i386/ldsodefs.h
new file mode 100644
index 0000000000..a369f5fc68
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ldsodefs.h
@@ -0,0 +1,41 @@
+/* Run-time dynamic linker data structures for loaded ELF shared objects.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef	_I386_LDSODEFS_H
+#define	_I386_LDSODEFS_H	1
+
+#include <elf.h>
+#include <cpu-features.h>
+
+struct La_i86_regs;
+struct La_i86_retval;
+
+#define ARCH_PLTENTER_MEMBERS						\
+    Elf32_Addr (*i86_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \
+				    uintptr_t *, struct La_i86_regs *,	\
+				    unsigned int *, const char *name,	\
+				    long int *framesizep)
+
+#define ARCH_PLTEXIT_MEMBERS						\
+    unsigned int (*i86_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \
+				     uintptr_t *, const struct La_i86_regs *, \
+				     struct La_i86_retval *, const char *)
+
+#include_next <ldsodefs.h>
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/link-defines.sym b/REORG.TODO/sysdeps/i386/link-defines.sym
new file mode 100644
index 0000000000..0995adb37f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/link-defines.sym
@@ -0,0 +1,20 @@
+#include "link.h"
+#include <stddef.h>
+
+--
+LONG_DOUBLE_SIZE	sizeof (long double)
+
+LR_SIZE			sizeof (struct La_i86_regs)
+LR_EDX_OFFSET		offsetof (struct La_i86_regs, lr_edx)
+LR_ECX_OFFSET		offsetof (struct La_i86_regs, lr_ecx)
+LR_EAX_OFFSET		offsetof (struct La_i86_regs, lr_eax)
+LR_EBP_OFFSET		offsetof (struct La_i86_regs, lr_ebp)
+LR_ESP_OFFSET		offsetof (struct La_i86_regs, lr_esp)
+
+LRV_SIZE		sizeof (struct La_i86_retval)
+LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
+LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
+LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
+LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)
+LRV_BND0_OFFSET		offsetof (struct La_i86_retval, lrv_bnd0)
+LRV_BND1_OFFSET		offsetof (struct La_i86_retval, lrv_bnd1)
diff --git a/REORG.TODO/sysdeps/i386/lshift.S b/REORG.TODO/sysdeps/i386/lshift.S
new file mode 100644
index 0000000000..fa4b07793f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/lshift.S
@@ -0,0 +1,103 @@
+/* i80386 __mpn_lshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+12		/* space for 3 saved regs */
+#define RES	PARMS
+#define S	RES+4
+#define SIZE	S+4
+#define CNT	SIZE+4
+
+	.text
+ENTRY (__mpn_lshift)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 8)
+	movl	S(%esp),%esi
+	cfi_rel_offset (esi, 4)
+	movl	SIZE(%esp),%edx
+	movl	CNT(%esp),%ecx
+	subl	$4,%esi			/* adjust s_ptr */
+
+	movl	(%esi,%edx,4),%ebx	/* read most significant limb */
+	cfi_rel_offset (ebx, 0)
+	cfi_remember_state
+	xorl	%eax,%eax
+	shldl	%cl,%ebx,%eax		/* compute carry limb */
+	decl	%edx
+	jz	L(end)
+	pushl	%eax			/* push carry limb onto stack */
+	cfi_adjust_cfa_offset (4)
+	testb	$1,%dl
+	jnz	L(1)			/* enter loop in the middle */
+	movl	%ebx,%eax
+
+	ALIGN (3)
+L(oop):	movl	(%esi,%edx,4),%ebx	/* load next lower limb */
+	shldl	%cl,%ebx,%eax		/* compute result limb */
+	movl	%eax,(%edi,%edx,4)	/* store it */
+	decl	%edx
+L(1):	movl	(%esi,%edx,4),%eax
+	shldl	%cl,%eax,%ebx
+	movl	%ebx,(%edi,%edx,4)
+	decl	%edx
+	jnz	L(oop)
+
+	shll	%cl,%eax		/* compute least significant limb */
+	movl	%eax,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+	cfi_adjust_cfa_offset (-4)
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_restore_state
+L(end):	shll	%cl,%ebx		/* compute least significant limb */
+	movl	%ebx,(%edi)		/* store it */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_lshift)
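__mpn_lshift shifts an N-limb number left by CNT bits (0 < CNT < 32),
stores the result, and returns the bits shifted out of the most
significant limb, which the assembly computes first with shldl into a
zeroed %eax.  A portable reference (a sketch assuming 32-bit limbs;
working from the top limb down, like the assembly, also permits
res_ptr == s_ptr):

    #include <gmp.h>

    static mp_limb_t
    mpn_lshift_ref (mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n,
                    unsigned int cnt)
    {
      mp_limb_t carry = sp[n - 1] >> (32 - cnt);   /* the carry limb */
      for (mp_size_t i = n - 1; i > 0; i--)
        rp[i] = (sp[i] << cnt) | (sp[i - 1] >> (32 - cnt));
      rp[0] = sp[0] << cnt;
      return carry;
    }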
diff --git a/REORG.TODO/sysdeps/i386/machine-gmon.h b/REORG.TODO/sysdeps/i386/machine-gmon.h
new file mode 100644
index 0000000000..d5d8cdf7c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/machine-gmon.h
@@ -0,0 +1,40 @@
+/* i386-specific implementation of profiling support.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* We need a special version of the `mcount' function since for ix86 it
+   must not clobber any register.  There are several reasons for this:
+     - there is a bug in gcc as of version 2.7.2.2 which prohibits the
+       use of profiling together with nested functions
+     - the ELF `fixup' function uses GCC's regparm feature
+     - some (future) systems might want to pass parameters in registers.  */
+
+/* We must not pollute the global namespace.  */
+#define mcount_internal __mcount_internal
+
+extern void mcount_internal (u_long frompc, u_long selfpc) internal_function;
+
+#define _MCOUNT_DECL(frompc, selfpc) \
+void internal_function mcount_internal (u_long frompc, u_long selfpc)
+
+
+/* Define MCOUNT as empty since we have the implementation in another
+   file.  */
+#define MCOUNT
diff --git a/REORG.TODO/sysdeps/i386/memchr.S b/REORG.TODO/sysdeps/i386/memchr.S
new file mode 100644
index 0000000000..db4a6418ff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memchr.S
@@ -0,0 +1,322 @@
+/* memchr (str, chr, len) -- Return pointer to first occurrence of CHR
+	 within the first LEN bytes of STR.  For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+   This version is developed using the same algorithm as the fast C
+   version which carries the following introduction:
+   Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+   with help from Dan Sahlin (dan@sics.se) and
+   commentary by Jim Blandy (jimb@ai.mit.edu);
+   adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+   and implemented by Roland McGrath (roland@ai.mit.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+8		/* space for 2 saved regs */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+#define LEN	CHR+4
+
+	.text
+ENTRY (__memchr)
+
+	/* Save callee-safe registers used in this function.  */
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+
+	/* Load parameters into registers.  */
+	movl STR(%esp), %eax	/* str: pointer to memory block.  */
+	movl CHR(%esp), %edx	/* c: byte we are looking for.  */
+	movl LEN(%esp), %esi	/* len: length of memory block.  */
+	cfi_rel_offset (esi, 4)
+
+	/* If we must not test more than three characters, test
+	   them one by one.  This is especially true for 0.  */
+	cmpl $4, %esi
+	jb L(3)
+
+	/* At the moment %edx contains CHR.  What we need for the
+	   algorithm is CHR in all bytes of the dword.  Avoid
+	   operations on 16-bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* Now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* Now c|c|0|0 */
+	movw %cx, %dx		/* And finally c|c|c|c */
+
+	/* Better performance can be achieved if the word (32
+	   bit) memory access is aligned on a four-byte boundary.
+	   So process the first bytes one by one until the boundary is
+	   reached; to keep this fast, we don't use a loop.  */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L(4)			/* len==0 => return NULL */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L(4)			/* len==0 => return NULL */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	/* no test for len==0 here, because this is done in the
+	   loop head */
+	jmp L(2)
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for CHR, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is CHR.  This turns each byte that is CHR
+	 into a zero.  */
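+
+   /* The same test, as a rough C sketch (for illustration only).  Bytes
+      equal to CHR become 0 after the XOR; the 64-bit sum makes the
+      carry out of bit 31 visible:
+
+	uint32_t w = word ^ charmask;
+	uint64_t s = (uint64_t) w + 0xfefefeffu;
+	int match = (s >> 32) == 0
+	  || ((((uint32_t) s ^ w) | 0xfefefeffu) + 1) != 0;
+    */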
+
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN (4)
+
+L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we would have to reverse the effect
+	   of the XOR first and then test the overflow bits.  But because
+	   the following XOR would destroy the carry flag and would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can test this condition now.  If no carry is
+	   signaled, no overflow can have occurred in the last byte => it
+	   was 0.  */
+	jnc L(8)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is CHR we don't get 0 in %edi.  */
+	jnz L(8)		/* found it => return pointer */
+
+	/* This process is unrolled four times for better performance.
+	   We don't increment the source pointer each time.  Instead we
+	   use offsets and increment by 16 in each run of the loop.  But
+	   before probing for the matching byte we need some extra code
+	   (see L(5) through L(8) below).  Even len can be compared with
+	   constants instead of being decremented each time.  */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(7)		/* found it => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(6)		/* found it => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(5)		/* found it => return pointer */
+
+	/* Adjust both counters for a full round, i.e. 16 bytes.  */
+	addl $16, %eax
+L(2):	subl $16, %esi
+	jae L(1)		/* at least 16 bytes still remaining */
+
+	/* Process remaining bytes separately.  */
+	cmpl $4-16, %esi	/* rest < 4 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
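+	/* Note: %esi already had 16 subtracted above, so the remaining
+	   length is really %esi + 16; comparing against 4-16 (and below
+	   against 8-16 and 12-16) tests the rest without undoing the
+	   subtraction.  */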
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $8-16, %esi	/* rest < 8 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $12-16, %esi	/* rest < 12 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	/* Check the remaining bytes one by one.  */
+L(3):	andl $3, %esi		/* mask out uninteresting bytes */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+
+L(4):	/* no byte found => return NULL */
+	xorl %eax, %eax
+	jmp L(9)
+
+	/* add missing source pointer increments */
+L(5):	addl $4, %eax
+L(6):	addl $4, %eax
+L(7):	addl $4, %eax
+
+	/* Test for the matching byte in the word.  %ecx contains a NUL
+	   char in the byte which originally was the byte we are looking
+	   at.  */
+L(8):	testb %cl, %cl		/* test first byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testb %ch, %ch		/* test second byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testl $0xff0000, %ecx	/* test third byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	/* No further test needed; we know it is one of the four bytes.  */
+L(9):	popl %edi		/* pop saved registers */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+
+	ret
+END (__memchr)
+
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
diff --git a/REORG.TODO/sysdeps/i386/memcmp.S b/REORG.TODO/sysdeps/i386/memcmp.S
new file mode 100644
index 0000000000..01f8f8ef03
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcmp.S
@@ -0,0 +1,73 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define BLK1	PARMS
+#define BLK2	BLK1+4
+#define LEN	BLK2+4
+
+	.text
+ENTRY (memcmp)
+
+	pushl %esi		/* Save callee-saved registers.  */
+	cfi_adjust_cfa_offset (4)
+	movl %edi, %edx		/* Note that %edx is not otherwise used and
+				   so can hold the saved %edi.  It's faster
+				   than pushing it.  */
+	cfi_register (edi, edx)
+
+	movl BLK1(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl BLK2(%esp), %edi
+	movl LEN(%esp), %ecx
+
+	cld			/* Set direction of comparison.  */
+
+	xorl %eax, %eax		/* Default result.  */
+
+	repe			/* Compare at most %ecx bytes.  */
+	cmpsb
+	jz L(1)			/* If even last byte was equal we return 0.  */
+
+	/* The memory blocks are not equal.  So the result of the last
+	   subtraction is present in the carry flag.  It is set when
+	   the byte in block #2 is bigger.  In this case we have to
+	   return -1 (=0xffffffff), else 1.  */
+	sbbl %eax, %eax		/* This is tricky.  %eax == 0 and carry is set
+				   or not depending on last subtraction.  */
+
+	/* At this point %eax == 0 if the last byte of block #1 was bigger, and
+	   0xffffffff if the last byte of block #2 was bigger.  The latter
+	   case is already correct but the former needs a little adjustment.
+	   Note that the following operation does not change 0xffffffff.  */
+	orb $1, %al		/* Change 0 to 1.  */
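+
+	/* Worked example: if the differing byte of block #2 was bigger,
+	   the carry is set and sbbl yields 0 - 0 - 1 = 0xffffffff (-1),
+	   which orb leaves alone; otherwise sbbl yields 0 and orb turns
+	   it into +1.  */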
+
+L(1):	popl %esi		/* Restore registers.  */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	movl %edx, %edi
+	cfi_restore (edi)
+
+	ret
+END (memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
diff --git a/REORG.TODO/sysdeps/i386/memcopy.h b/REORG.TODO/sysdeps/i386/memcopy.h
new file mode 100644
index 0000000000..dc6173ee29
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcopy.h
@@ -0,0 +1,92 @@
+/* memcopy.h -- definitions for memory copy functions.  i386 version.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/memcopy.h>
+
+#undef	OP_T_THRES
+#define	OP_T_THRES	8
+
+#undef	BYTE_COPY_FWD
+#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)				      \
+  do {									      \
+    int __d0;								      \
+    asm volatile(/* Clear the direction flag, so copying goes forward.  */    \
+		 "cld\n"						      \
+		 /* Copy bytes.  */					      \
+		 "rep\n"						      \
+		 "movsb" :						      \
+		 "=D" (dst_bp), "=S" (src_bp), "=c" (__d0) :		      \
+		 "0" (dst_bp), "1" (src_bp), "2" (nbytes) :		      \
+		 "memory");						      \
+  } while (0)
+
+#undef	BYTE_COPY_BWD
+#define BYTE_COPY_BWD(dst_ep, src_ep, nbytes)				      \
+  do									      \
+    {									      \
+      int __d0;								      \
+      asm volatile(/* Set the direction flag, so copying goes backwards.  */  \
+		   "std\n"						      \
+		   /* Copy bytes.  */					      \
+		   "rep\n"						      \
+		   "movsb\n"						      \
+		   /* Clear the dir flag.  Convention says it should be 0. */ \
+		   "cld" :						      \
+		   "=D" (dst_ep), "=S" (src_ep), "=c" (__d0) :		      \
+		   "0" (dst_ep - 1), "1" (src_ep - 1), "2" (nbytes) :	      \
+		   "memory");						      \
+      dst_ep += 1;							      \
+      src_ep += 1;							      \
+    } while (0)
+
+#undef	WORD_COPY_FWD
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)		      \
+  do									      \
+    {									      \
+      int __d0;								      \
+      asm volatile(/* Clear the direction flag, so copying goes forward.  */  \
+		   "cld\n"						      \
+		   /* Copy longwords.  */				      \
+		   "rep\n"						      \
+		   "movsl" :						      \
+ 		   "=D" (dst_bp), "=S" (src_bp), "=c" (__d0) :		      \
+		   "0" (dst_bp), "1" (src_bp), "2" ((nbytes) / 4) :	      \
+		   "memory");						      \
+      (nbytes_left) = (nbytes) % 4;					      \
+    } while (0)
+
+#undef	WORD_COPY_BWD
+#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes)		      \
+  do									      \
+    {									      \
+      int __d0;								      \
+      asm volatile(/* Set the direction flag, so copying goes backwards.  */  \
+		   "std\n"						      \
+		   /* Copy longwords.  */				      \
+		   "rep\n"						      \
+		   "movsl\n"						      \
+		   /* Clear the dir flag.  Convention says it should be 0. */ \
+		   "cld" :						      \
+		   "=D" (dst_ep), "=S" (src_ep), "=c" (__d0) :		      \
+		   "0" (dst_ep - 4), "1" (src_ep - 4), "2" ((nbytes) / 4) :   \
+		   "memory");						      \
+      dst_ep += 4;							      \
+      src_ep += 4;							      \
+      (nbytes_left) = (nbytes) % 4;					      \
+    } while (0)
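+
+/* A rough sketch of how a copy routine might combine these macros
+   (hypothetical driver code, for illustration only):
+
+     unsigned long int dstp = (unsigned long int) dst;
+     unsigned long int srcp = (unsigned long int) src;
+     size_t left;
+     if (len >= OP_T_THRES)
+       {
+	 WORD_COPY_FWD (dstp, srcp, left, len);
+	 len = left;
+       }
+     BYTE_COPY_FWD (dstp, srcp, len);
+*/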
diff --git a/REORG.TODO/sysdeps/i386/memcpy.S b/REORG.TODO/sysdeps/i386/memcpy.S
new file mode 100644
index 0000000000..06568ea724
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcpy.S
@@ -0,0 +1,95 @@
+/* memcpy with REP MOVSB
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		memcpy
+# define MEMCPY_CHK	__memcpy_chk
+#endif
+
+#ifdef USE_AS_BCOPY
+# define STR2		12
+# define STR1		STR2+4
+# define N     		STR1+4
+#else
+# define STR1		12
+# define STR2		STR1+4
+# define N     		STR2+4
+#endif
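+
+/* With the return address and the two registers pushed below, the
+   arguments start at 12(%esp); bcopy takes (src, dst, n) rather than
+   (dst, src, n), hence the swapped layout above.  */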
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+	.text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	movl	N(%esp), %ecx
+	movl	STR1(%esp), %edi
+	movl	STR2(%esp), %esi
+	mov	%edi, %eax
+#ifdef USE_AS_MEMPCPY
+	add	%ecx, %eax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%esi, %edi
+	ja	L(copy_backward)
+	je	L(bwd_write_0bytes)
+#endif
+
+	rep	movsb
+	POP	(%edi)
+	POP	(%esi)
+	ret
+
+#ifdef USE_AS_MEMMOVE
+L(copy_backward):
+	lea	-1(%edi,%ecx), %edi
+	lea	-1(%esi,%ecx), %esi
+	std
+	rep	movsb
+	cld
+L(bwd_write_0bytes):
+	POP	(%edi)
+	POP	(%esi)
+	ret
+#endif
+
+END (MEMCPY)
+
+#ifndef USE_AS_BCOPY
+libc_hidden_builtin_def (MEMCPY)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memcpy_chk.S b/REORG.TODO/sysdeps/i386/memcpy_chk.S
new file mode 100644
index 0000000000..0f6f585c41
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcpy_chk.S
@@ -0,0 +1,34 @@
+/* Checking memcpy for i386.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+	/* For libc.so this is defined in memcpy.S.
+	   For libc.a, this is a separate source to avoid
+	   memcpy bringing in __chk_fail and all routines
+	   it calls.  */
+        .text
+ENTRY (__memcpy_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	memcpy
+END (__memcpy_chk)
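+
+/* Roughly equivalent C, as an illustrative sketch:
+
+     void *__memcpy_chk (void *dst, const void *src, size_t len,
+			 size_t dstlen)
+     {
+       if (dstlen < len)
+	 __chk_fail ();
+       return memcpy (dst, src, len);
+     }  */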
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memmove.S b/REORG.TODO/sysdeps/i386/memmove.S
new file mode 100644
index 0000000000..60a45d21e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memmove.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		memmove
+#define MEMCPY_CHK	__memmove_chk
+#include "memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/memmove_chk.S b/REORG.TODO/sysdeps/i386/memmove_chk.S
new file mode 100644
index 0000000000..0c7037cc05
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memmove_chk.S
@@ -0,0 +1,33 @@
+/* Checking memmove for i386.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in memmove.S.  For libc.a, this is a
+   separate source to avoid memmove bringing in __chk_fail and all
+   routines it calls.  */
+        .text
+ENTRY (__memmove_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	memmove
+END (__memmove_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/mempcpy.S b/REORG.TODO/sysdeps/i386/mempcpy.S
new file mode 100644
index 0000000000..61addb75f4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mempcpy.S
@@ -0,0 +1,7 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy
+#define MEMCPY_CHK	__mempcpy_chk
+#include "memcpy.S"
+
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/mempcpy_chk.S
new file mode 100644
index 0000000000..4d8ac5c25b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mempcpy_chk.S
@@ -0,0 +1,33 @@
+/* Checking mempcpy for i386.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in mempcpy.S.  For libc.a, this is a
+   separate source to avoid mempcpy bringing in __chk_fail and all
+   routines it calls.  */
+        .text
+ENTRY (__mempcpy_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	mempcpy
+END (__mempcpy_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memset.S b/REORG.TODO/sysdeps/i386/memset.S
new file mode 100644
index 0000000000..46ae65d2e4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memset.S
@@ -0,0 +1,68 @@
+/* memset with REP STOSB
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define STR1  8
+#ifdef USE_AS_BZERO
+#define N     STR1+4
+#else
+#define STR2  STR1+4
+#define N     STR2+4
+#endif
+
+	.text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk)
+#endif
+ENTRY (memset)
+	PUSH    (%edi)
+	movl	N(%esp), %ecx
+	movl	STR1(%esp), %edi
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	STR2(%esp), %eax
+	mov	%edi, %edx
+#endif
+	rep	stosb
+#ifndef USE_AS_BZERO
+	mov	%edx, %eax
+#endif
+	POP     (%edi)
+	ret
+END (memset)
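+
+/* The rep stosb above behaves like this C sketch, with %edx holding
+   the original destination for the return value:
+
+     unsigned char *p = dst;		   (%edi)
+     while (count-- != 0)		   (%ecx)
+       *p++ = (unsigned char) c;	   (%al)
+*/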
+
+#ifndef USE_AS_BZERO
+libc_hidden_builtin_def (memset)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memset_chk.S b/REORG.TODO/sysdeps/i386/memset_chk.S
new file mode 100644
index 0000000000..da7837111e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memset_chk.S
@@ -0,0 +1,33 @@
+/* Checking memset for i386.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in memset.S.  For libc.a, this is a
+   separate source to avoid memset bringing in __chk_fail and all
+   routines it calls.  */
+        .text
+ENTRY (__memset_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	memset
+END (__memset_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memusage.h b/REORG.TODO/sysdeps/i386/memusage.h
new file mode 100644
index 0000000000..30167be833
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memusage.h
@@ -0,0 +1,20 @@
+/* Copyright (C) 2000-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; })
+
+#include <sysdeps/generic/memusage.h>
diff --git a/REORG.TODO/sysdeps/i386/mp_clz_tab.c b/REORG.TODO/sysdeps/i386/mp_clz_tab.c
new file mode 100644
index 0000000000..860f98cc62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mp_clz_tab.c
@@ -0,0 +1 @@
+/* __clz_tab not needed on i386.  */
diff --git a/REORG.TODO/sysdeps/i386/mul_1.S b/REORG.TODO/sysdeps/i386/mul_1.S
new file mode 100644
index 0000000000..cf83d1b343
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+   the result in a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_mul_1)
+
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %size
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%size,4), %res_ptr
+	leal	(%s1_ptr,%size,4), %s1_ptr
+	negl	%size
+	xorl	%ebp, %ebp
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+L(oop):
+	movl	(%s1_ptr,%size,4), %eax
+	mull	%s2_limb
+	addl	%ebp, %eax
+	movl	%eax, (%res_ptr,%size,4)
+	adcl	$0, %edx
+	movl	%edx, %ebp
+
+	incl	%size
+	jnz	L(oop)
+	movl	%ebp, %eax
+
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+#undef size
+END (__mpn_mul_1)
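+
+/* Illustrative C equivalent of the loop above (sketch only):
+
+     mp_limb_t carry = 0;
+     for (mp_size_t i = 0; i < size; i++)
+       {
+	 uint64_t p = (uint64_t) s1[i] * s2_limb + carry;
+	 res[i] = (uint32_t) p;
+	 carry = (uint32_t) (p >> 32);
+       }
+     return carry;  */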
diff --git a/REORG.TODO/sysdeps/i386/nptl/Makefile b/REORG.TODO/sysdeps/i386/nptl/Makefile
new file mode 100644
index 0000000000..2c61b352eb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/Makefile
@@ -0,0 +1,26 @@
+# Copyright (C) 2002-2017 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+ifeq ($(subdir),csu)
+gen-as-const-headers += tcb-offsets.sym
+endif
+
+ifeq ($(subdir),nptl)
+CFLAGS-pthread_create.c += -mpreferred-stack-boundary=4
+CFLAGS-tst-align.c += -mpreferred-stack-boundary=4
+CFLAGS-tst-align2.c += -mpreferred-stack-boundary=4
+endif
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c
new file mode 100644
index 0000000000..a1205b9698
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c
@@ -0,0 +1,19 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Not needed.  pthread_spin_init is an alias for pthread_spin_unlock.  */
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S
new file mode 100644
index 0000000000..160244b7a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S
@@ -0,0 +1,37 @@
+/* Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <lowlevellock.h>
+
+	.globl	pthread_spin_lock
+	.type	pthread_spin_lock,@function
+	.align	16
+pthread_spin_lock:
+	mov	4(%esp), %eax
+1:	LOCK
+	decl	0(%eax)
+	jne	2f
+	xor	%eax, %eax
+	ret
+
+	.align	16
+2:	rep
+	nop
+	cmpl	$0, 0(%eax)
+	jg	1b
+	jmp	2b
+	.size	pthread_spin_lock,.-pthread_spin_lock
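+
+/* The lock value is 1 when free.  A rough C sketch of the loop above,
+   with hypothetical helper names:
+
+     while (atomic_fetch_sub (lock, 1) != 1)	(the LOCK decl)
+       while (*lock <= 0)			(spin without locking the bus)
+	 pause ();				(the rep nop)
+     return 0;  */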
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S
new file mode 100644
index 0000000000..b6636ae8d7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S
@@ -0,0 +1,31 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+	.globl	pthread_spin_unlock
+	.type	pthread_spin_unlock,@function
+	.align	16
+pthread_spin_unlock:
+	movl	4(%esp), %eax
+	movl	$1, (%eax)
+	xorl	%eax, %eax
+	ret
+	.size	pthread_spin_unlock,.-pthread_spin_unlock
+
+	/* The implementation of pthread_spin_init is identical.  */
+	.globl	pthread_spin_init
+pthread_spin_init = pthread_spin_unlock
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h b/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h
new file mode 100644
index 0000000000..54abccd11b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Default stack size.  */
+#define ARCH_STACK_DEFAULT_SIZE	(2 * 1024 * 1024)
+
+/* Required stack pointer alignment at beginning.  SSE requires 16
+   bytes.  */
+#define STACK_ALIGN		16
+
+/* Minimal stack size after allocating thread descriptor and guard size.  */
+#define MINIMAL_REST_STACK	2048
+
+/* Alignment requirement for TCB.
+
+   Some processors such as Intel Atom pay a big penalty on every
+   access using a segment override if that segment's base is not
+   aligned to the size of a cache line.  (See Intel 64 and IA-32
+   Architectures Optimization Reference Manual, section 13.3.3.3,
+   "Segment Base".)  On such machines, a cache line is 64 bytes.  */
+#define TCB_ALIGNMENT		64
+
+
+/* Location of current stack frame.  */
+#define CURRENT_STACK_FRAME	__builtin_frame_address (0)
diff --git a/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym b/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym
new file mode 100644
index 0000000000..695a810386
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym
@@ -0,0 +1,17 @@
+#include <sysdep.h>
+#include <tls.h>
+#include <kernel-features.h>
+
+RESULT			offsetof (struct pthread, result)
+TID			offsetof (struct pthread, tid)
+CANCELHANDLING		offsetof (struct pthread, cancelhandling)
+CLEANUP_JMP_BUF		offsetof (struct pthread, cleanup_jmp_buf)
+MULTIPLE_THREADS_OFFSET	offsetof (tcbhead_t, multiple_threads)
+SYSINFO_OFFSET		offsetof (tcbhead_t, sysinfo)
+CLEANUP			offsetof (struct pthread, cleanup)
+CLEANUP_PREV		offsetof (struct _pthread_cleanup_buffer, __prev)
+MUTEX_FUTEX		offsetof (pthread_mutex_t, __data.__lock)
+POINTER_GUARD		offsetof (tcbhead_t, pointer_guard)
+#ifndef __ASSUME_PRIVATE_FUTEX
+PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/nptl/tls.h b/REORG.TODO/sysdeps/i386/nptl/tls.h
new file mode 100644
index 0000000000..f9a6b11ecf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/tls.h
@@ -0,0 +1,435 @@
+/* Definition for thread-local data handling.  nptl/i386 version.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _TLS_H
+#define _TLS_H	1
+
+#include <dl-sysdep.h>
+#ifndef __ASSEMBLER__
+# include <stdbool.h>
+# include <stddef.h>
+# include <stdint.h>
+# include <stdlib.h>
+# include <sysdep.h>
+# include <libc-pointer-arith.h> /* For cast_to_integer. */
+# include <kernel-features.h>
+# include <dl-dtv.h>
+
+typedef struct
+{
+  void *tcb;		/* Pointer to the TCB.  Not necessarily the
+			   thread descriptor used by libpthread.  */
+  dtv_t *dtv;
+  void *self;		/* Pointer to the thread descriptor.  */
+  int multiple_threads;
+  uintptr_t sysinfo;
+  uintptr_t stack_guard;
+  uintptr_t pointer_guard;
+  int gscope_flag;
+#ifndef __ASSUME_PRIVATE_FUTEX
+  int private_futex;
+#else
+  int __glibc_reserved1;
+#endif
+  /* Reservation of some values for the TM ABI.  */
+  void *__private_tm[4];
+  /* GCC split stack support.  */
+  void *__private_ss;
+} tcbhead_t;
+
+# define TLS_MULTIPLE_THREADS_IN_TCB 1
+
+#else /* __ASSEMBLER__ */
+# include <tcb-offsets.h>
+#endif
+
+
+/* Alignment requirement for the stack.  For IA-32 this is governed by
+   the SSE memory functions.  */
+#define STACK_ALIGN	16
+
+#ifndef __ASSEMBLER__
+/* Get system call information.  */
+# include <sysdep.h>
+
+/* The old way: using LDT.  */
+
+/* Structure passed to `modify_ldt', 'set_thread_area', and 'clone' calls.  */
+struct user_desc
+{
+  unsigned int entry_number;
+  unsigned long int base_addr;
+  unsigned int limit;
+  unsigned int seg_32bit:1;
+  unsigned int contents:2;
+  unsigned int read_exec_only:1;
+  unsigned int limit_in_pages:1;
+  unsigned int seg_not_present:1;
+  unsigned int useable:1;
+  unsigned int empty:25;
+};
+
+/* Initializing bit fields is slow.  We speed it up by using a union.  */
+union user_desc_init
+{
+  struct user_desc desc;
+  unsigned int vals[4];
+};
+
+
+/* This is the size of the initial TCB.  Can't be just sizeof (tcbhead_t),
+   because NPTL getpid, __libc_alloca_cutoff etc. need (almost) the whole
+   struct pthread even when not linked with -lpthread.  */
+# define TLS_INIT_TCB_SIZE sizeof (struct pthread)
+
+/* Alignment requirements for the initial TCB.  */
+# define TLS_INIT_TCB_ALIGN __alignof__ (struct pthread)
+
+/* This is the size of the TCB.  */
+# define TLS_TCB_SIZE sizeof (struct pthread)
+
+/* Alignment requirements for the TCB.  */
+# define TLS_TCB_ALIGN __alignof__ (struct pthread)
+
+/* The TCB can have any size and the memory following the address the
+   thread pointer points to is unspecified.  Allocate the TCB there.  */
+# define TLS_TCB_AT_TP	1
+# define TLS_DTV_AT_TP	0
+
+/* Get the thread descriptor definition.  */
+# include <nptl/descr.h>
+
+
+/* Install the dtv pointer.  The pointer passed is to the element with
+   index -1, which contains the length.  */
+# define INSTALL_DTV(descr, dtvp) \
+  ((tcbhead_t *) (descr))->dtv = (dtvp) + 1
+
+/* Install new dtv for current thread.  */
+# define INSTALL_NEW_DTV(dtvp) \
+  ({ struct pthread *__pd;						      \
+     THREAD_SETMEM (__pd, header.dtv, (dtvp)); })
+
+/* Return dtv of given thread descriptor.  */
+# define GET_DTV(descr) \
+  (((tcbhead_t *) (descr))->dtv)
+
+/* Macros to load from and store into segment registers.  */
+# ifndef TLS_GET_GS
+#  define TLS_GET_GS() \
+  ({ int __seg; __asm ("movw %%gs, %w0" : "=q" (__seg)); __seg & 0xffff; })
+# endif
+# ifndef TLS_SET_GS
+#  define TLS_SET_GS(val) \
+  __asm ("movw %w0, %%gs" :: "q" (val))
+# endif
+
+#ifdef NEED_DL_SYSINFO
+# define INIT_SYSINFO \
+  _head->sysinfo = GLRO(dl_sysinfo)
+# define SETUP_THREAD_SYSINFO(pd) \
+  ((pd)->header.sysinfo = THREAD_GETMEM (THREAD_SELF, header.sysinfo))
+# define CHECK_THREAD_SYSINFO(pd) \
+  assert ((pd)->header.sysinfo == THREAD_GETMEM (THREAD_SELF, header.sysinfo))
+#else
+# define INIT_SYSINFO
+#endif
+
+#ifndef LOCK_PREFIX
+# ifdef UP
+#  define LOCK_PREFIX  /* nothing */
+# else
+#  define LOCK_PREFIX "lock;"
+# endif
+#endif
+
+static inline void __attribute__ ((unused, always_inline))
+tls_fill_user_desc (union user_desc_init *desc,
+                    unsigned int entry_number,
+                    void *pd)
+{
+  desc->vals[0] = entry_number;
+  /* The 'base_addr' field.  Pointer to the TCB.  */
+  desc->vals[1] = (unsigned long int) pd;
+  /* The 'limit' field.  We use 4GB which is 0xfffff pages.  */
+  desc->vals[2] = 0xfffff;
+  /* Collapsed value of the bitfield:
+     .seg_32bit = 1
+     .contents = 0
+     .read_exec_only = 0
+     .limit_in_pages = 1
+     .seg_not_present = 0
+     .useable = 1 */
+  desc->vals[3] = 0x51;
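+  /* That is bit 0 (seg_32bit), bit 4 (limit_in_pages) and bit 6
+     (useable): 0x01 | 0x10 | 0x40 == 0x51.  */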
+}
+
+/* Code to initialize the thread pointer for the initial thread.  This
+   might need special attention since 'errno' is not yet available and
+   if the operation can cause a failure 'errno' must not be touched.  */
+# define TLS_INIT_TP(thrdescr) \
+  ({ void *_thrdescr = (thrdescr);					      \
+     tcbhead_t *_head = _thrdescr;					      \
+     union user_desc_init _segdescr;					      \
+     int _result;							      \
+									      \
+     _head->tcb = _thrdescr;						      \
+     /* For now the thread descriptor is at the same address.  */	      \
+     _head->self = _thrdescr;						      \
+     /* New syscall handling support.  */				      \
+     INIT_SYSINFO;							      \
+									      \
+     /* Let the kernel pick a value for the 'entry_number' field.  */	      \
+     tls_fill_user_desc (&_segdescr, -1, _thrdescr);			      \
+									      \
+     /* Install the TLS.  */						      \
+     INTERNAL_SYSCALL_DECL (err);					      \
+     _result = INTERNAL_SYSCALL (set_thread_area, err, 1, &_segdescr.desc);   \
+									      \
+     if (_result == 0)							      \
+       /* We know the index in the GDT, now load the segment register.	      \
+	  The lower three bits of the selector are the table indicator	      \
+	  (0 = GDT) and the requested privilege level (3).		      \
+									      \
+	  Note that we have to do this even if the numeric value of	      \
+	  the descriptor does not change.  Loading the segment register	      \
+	  causes the segment information from the GDT to be loaded	      \
+	  which is necessary since we have changed it.   */		      \
+       TLS_SET_GS (_segdescr.desc.entry_number * 8 + 3);		      \
+									      \
+     _result == 0 ? NULL						      \
+     : "set_thread_area failed when setting up thread-local storage\n"; })
+
+# define TLS_DEFINE_INIT_TP(tp, pd)					      \
+  union user_desc_init _segdescr;					      \
+  /* Find the 'entry_number' field that the kernel selected in TLS_INIT_TP.   \
+     The lower three bits of the segment register value are the table	      \
+     indicator and privilege level; shift them off to get the GDT index      \
+     from the current thread's %gs value.  */				      \
+  tls_fill_user_desc (&_segdescr, TLS_GET_GS () >> 3, pd);		      \
+  const struct user_desc *tp = &_segdescr.desc
+
+
+/* Return the address of the dtv for the current thread.  */
+# define THREAD_DTV() \
+  ({ struct pthread *__pd;						      \
+     THREAD_GETMEM (__pd, header.dtv); })
+
+
+/* Return the thread descriptor for the current thread.
+
+   The contained asm must *not* be marked volatile since otherwise
+   assignments like
+	pthread_descr self = thread_self();
+   do not get optimized away.  */
+# define THREAD_SELF \
+  ({ struct pthread *__self;						      \
+     asm ("movl %%gs:%c1,%0" : "=r" (__self)				      \
+	  : "i" (offsetof (struct pthread, header.self)));		      \
+     __self;})
+
+/* Magic for libthread_db to know how to do THREAD_SELF.  */
+# define DB_THREAD_SELF \
+  REGISTER_THREAD_AREA (32, offsetof (struct user_regs_struct, xgs), 3) \
+  REGISTER_THREAD_AREA (64, 26 * 8, 3) /* x86-64's user_regs_struct->gs */
+
+
+/* Read member of the thread descriptor directly.  */
+# define THREAD_GETMEM(descr, member) \
+  ({ __typeof (descr->member) __value;					      \
+     if (sizeof (__value) == 1)						      \
+       asm volatile ("movb %%gs:%P2,%b0"				      \
+		     : "=q" (__value)					      \
+		     : "0" (0), "i" (offsetof (struct pthread, member)));     \
+     else if (sizeof (__value) == 4)					      \
+       asm volatile ("movl %%gs:%P1,%0"					      \
+		     : "=r" (__value)					      \
+		     : "i" (offsetof (struct pthread, member)));	      \
+     else								      \
+       {								      \
+	 if (sizeof (__value) != 8)					      \
+	   /* There should not be any value with a size other than 1,	      \
+	      4 or 8.  */						      \
+	   abort ();							      \
+									      \
+	 asm volatile ("movl %%gs:%P1,%%eax\n\t"			      \
+		       "movl %%gs:%P2,%%edx"				      \
+		       : "=A" (__value)					      \
+		       : "i" (offsetof (struct pthread, member)),	      \
+			 "i" (offsetof (struct pthread, member) + 4));	      \
+       }								      \
+     __value; })
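+
+/* For instance, THREAD_GETMEM (THREAD_SELF, header.sysinfo) compiles
+   down to a single %gs-relative movl, since the member is 4 bytes
+   wide.  */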
+
+
+/* Same as THREAD_GETMEM, but the member offset can be non-constant.  */
+# define THREAD_GETMEM_NC(descr, member, idx) \
+  ({ __typeof (descr->member[0]) __value;				      \
+     if (sizeof (__value) == 1)						      \
+       asm volatile ("movb %%gs:%P2(%3),%b0"				      \
+		     : "=q" (__value)					      \
+		     : "0" (0), "i" (offsetof (struct pthread, member[0])),   \
+		     "r" (idx));					      \
+     else if (sizeof (__value) == 4)					      \
+       asm volatile ("movl %%gs:%P1(,%2,4),%0"				      \
+		     : "=r" (__value)					      \
+		     : "i" (offsetof (struct pthread, member[0])),	      \
+		       "r" (idx));					      \
+     else								      \
+       {								      \
+	 if (sizeof (__value) != 8)					      \
+	   /* There should not be any value with a size other than 1,	      \
+	      4 or 8.  */						      \
+	   abort ();							      \
+									      \
+	 asm volatile  ("movl %%gs:%P1(,%2,8),%%eax\n\t"		      \
+			"movl %%gs:4+%P1(,%2,8),%%edx"			      \
+			: "=&A" (__value)				      \
+			: "i" (offsetof (struct pthread, member[0])),	      \
+			  "r" (idx));					      \
+       }								      \
+     __value; })
+
+
+
+/* Set member of the thread descriptor directly.  */
+# define THREAD_SETMEM(descr, member, value) \
+  ({ if (sizeof (descr->member) == 1)					      \
+       asm volatile ("movb %b0,%%gs:%P1" :				      \
+		     : "iq" (value),					      \
+		       "i" (offsetof (struct pthread, member)));	      \
+     else if (sizeof (descr->member) == 4)				      \
+       asm volatile ("movl %0,%%gs:%P1" :				      \
+		     : "ir" (value),					      \
+		       "i" (offsetof (struct pthread, member)));	      \
+     else								      \
+       {								      \
+	 if (sizeof (descr->member) != 8)				      \
+	   /* There should not be any value with a size other than 1,	      \
+	      4 or 8.  */						      \
+	   abort ();							      \
+									      \
+	 asm volatile ("movl %%eax,%%gs:%P1\n\t"			      \
+		       "movl %%edx,%%gs:%P2" :				      \
+		       : "A" ((uint64_t) cast_to_integer (value)),	      \
+			 "i" (offsetof (struct pthread, member)),	      \
+			 "i" (offsetof (struct pthread, member) + 4));	      \
+       }})
+
+
+/* Same as THREAD_SETMEM, but the member offset can be non-constant.  */
+# define THREAD_SETMEM_NC(descr, member, idx, value) \
+  ({ if (sizeof (descr->member[0]) == 1)				      \
+       asm volatile ("movb %b0,%%gs:%P1(%2)" :				      \
+		     : "iq" (value),					      \
+		       "i" (offsetof (struct pthread, member)),		      \
+		       "r" (idx));					      \
+     else if (sizeof (descr->member[0]) == 4)				      \
+       asm volatile ("movl %0,%%gs:%P1(,%2,4)" :			      \
+		     : "ir" (value),					      \
+		       "i" (offsetof (struct pthread, member)),		      \
+		       "r" (idx));					      \
+     else								      \
+       {								      \
+	 if (sizeof (descr->member[0]) != 8)				      \
+	   /* There should not be any value with a size other than 1,	      \
+	      4 or 8.  */						      \
+	   abort ();							      \
+									      \
+	 asm volatile ("movl %%eax,%%gs:%P1(,%2,8)\n\t"			      \
+		       "movl %%edx,%%gs:4+%P1(,%2,8)" :			      \
+		       : "A" ((uint64_t) cast_to_integer (value)),	      \
+			 "i" (offsetof (struct pthread, member)),	      \
+			 "r" (idx));					      \
+       }})
+
+
+/* Atomic compare and exchange on TLS, returning old value.  */
+#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
+  ({ __typeof (descr->member) __ret;					      \
+     __typeof (oldval) __old = (oldval);				      \
+     if (sizeof (descr->member) == 4)					      \
+       asm volatile (LOCK_PREFIX "cmpxchgl %2, %%gs:%P3"		      \
+		     : "=a" (__ret)					      \
+		     : "0" (__old), "r" (newval),			      \
+		       "i" (offsetof (struct pthread, member)));	      \
+     else								      \
+       /* Not necessary for other sizes at the moment.  */		      \
+       abort ();							      \
+     __ret; })
+
+
+/* Atomic logical and.  */
+#define THREAD_ATOMIC_AND(descr, member, val) \
+  (void) ({ if (sizeof ((descr)->member) == 4)				      \
+	      asm volatile (LOCK_PREFIX "andl %1, %%gs:%P0"		      \
+			    :: "i" (offsetof (struct pthread, member)),	      \
+			       "ir" (val));				      \
+	    else							      \
+	      /* Not necessary for other sizes at the moment.  */	      \
+	      abort (); })
+
+
+/* Atomic set bit.  */
+#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
+  (void) ({ if (sizeof ((descr)->member) == 4)				      \
+	      asm volatile (LOCK_PREFIX "orl %1, %%gs:%P0"		      \
+			    :: "i" (offsetof (struct pthread, member)),	      \
+			       "ir" (1 << (bit)));			      \
+	    else							      \
+	      /* Not necessary for other sizes at the moment.  */	      \
+	      abort (); })
+
+
+/* Set the stack guard field in TCB head.  */
+#define THREAD_SET_STACK_GUARD(value) \
+  THREAD_SETMEM (THREAD_SELF, header.stack_guard, value)
+#define THREAD_COPY_STACK_GUARD(descr) \
+  ((descr)->header.stack_guard						      \
+   = THREAD_GETMEM (THREAD_SELF, header.stack_guard))
+
+
+/* Set the pointer guard field in the TCB head.  */
+#define THREAD_SET_POINTER_GUARD(value) \
+  THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value)
+#define THREAD_COPY_POINTER_GUARD(descr) \
+  ((descr)->header.pointer_guard					      \
+   = THREAD_GETMEM (THREAD_SELF, header.pointer_guard))
+
+
+/* Get and set the global scope generation counter in the TCB head.  */
+#define THREAD_GSCOPE_FLAG_UNUSED 0
+#define THREAD_GSCOPE_FLAG_USED   1
+#define THREAD_GSCOPE_FLAG_WAIT   2
+#define THREAD_GSCOPE_RESET_FLAG() \
+  do									      \
+    { int __res;							      \
+      asm volatile ("xchgl %0, %%gs:%P1"				      \
+		    : "=r" (__res)					      \
+		    : "i" (offsetof (struct pthread, header.gscope_flag)),    \
+		      "0" (THREAD_GSCOPE_FLAG_UNUSED));			      \
+      if (__res == THREAD_GSCOPE_FLAG_WAIT)				      \
+	lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE);    \
+    }									      \
+  while (0)
+#define THREAD_GSCOPE_SET_FLAG() \
+  THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
+#define THREAD_GSCOPE_WAIT() \
+  GL(dl_wait_lookup_done) ()
+
+#endif /* __ASSEMBLER__ */
+
+#endif	/* tls.h */
diff --git a/REORG.TODO/sysdeps/i386/preconfigure b/REORG.TODO/sysdeps/i386/preconfigure
new file mode 100644
index 0000000000..c8fefd1bff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/preconfigure
@@ -0,0 +1,5 @@
+# preconfigure fragment for i386.
+
+case "$machine" in
+i[4567]86)	base_machine=i386 machine=i386/$machine ;;
+esac
diff --git a/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S
new file mode 100644
index 0000000000..f71a9fcb2d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S
@@ -0,0 +1,46 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread-errnos.h>
+
+
+#ifdef UP
+# define LOCK
+#else
+# define LOCK lock
+#endif
+
+	.globl	pthread_spin_trylock
+	.type	pthread_spin_trylock,@function
+	.align	16
+pthread_spin_trylock:
+	movl	4(%esp), %edx
+	movl	$1, %eax
+	xorl	%ecx, %ecx
+	LOCK
+	cmpxchgl %ecx, (%edx)
+	movl	$EBUSY, %eax
+#ifdef HAVE_CMOV
+	cmovel	%ecx, %eax
+#else
+	jne	0f
+	movl	%ecx, %eax
+0:
+#endif
+	ret
+	.size	pthread_spin_trylock,.-pthread_spin_trylock
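
In C terms, the function above is a single compare-and-swap: the lock word holds 1 when free and 0 when taken, so trylock tries to replace 1 with 0 and reports EBUSY if the word was already 0.  A hedged sketch with a GCC builtin standing in for LOCK cmpxchgl (the real parameter type is pthread_spinlock_t, an int on i386):

#include <errno.h>

static int
spin_trylock_sketch (volatile int *lock)
{
  int expected = 1;			/* 1 == unlocked in this scheme */
  if (__atomic_compare_exchange_n (lock, &expected, 0, 0,
				   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
    return 0;				/* we stored 0: lock acquired */
  return EBUSY;				/* already held */
}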
diff --git a/REORG.TODO/sysdeps/i386/rawmemchr.S b/REORG.TODO/sysdeps/i386/rawmemchr.S
new file mode 100644
index 0000000000..246ec3f18e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/rawmemchr.S
@@ -0,0 +1,222 @@
+/* rawmemchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+   This version is developed using the same algorithm as the fast C
+   version which carries the following introduction:
+   Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+   with help from Dan Sahlin (dan@sics.se) and
+   commentary by Jim Blandy (jimb@ai.mit.edu);
+   adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+   and implemented by Roland McGrath (roland@ai.mit.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (__rawmemchr)
+
+	/* Save callee-safe register used in this function.  */
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+
+	/* Load parameters into registers.  */
+	movl STR(%esp), %eax
+	movl CHR(%esp), %edx
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16-bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* Now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* Now c|c|0|0 */
+	movw %cx, %dx		/* And finally c|c|c|c */
+
+	/* Better performance can be achieved if the word (32-bit)
+	   memory accesses are aligned on a four-byte boundary, so
+	   process the first bytes one by one until the boundary is
+	   reached.  The tests are unrolled rather than looped, again
+	   for performance.  */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(1)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(1)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(1)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+
+	/* Each round the main loop processes 16 bytes.  */
+	ALIGN (4)
+
+L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can test this condition now.  If no carry is
+	   signaled, no overflow can have occurred in the last byte =>
+	   it was 0.  */
+	jnc L(8)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L(8)		/* found it => return pointer */
+
+	/* This process is unfolded four times for better performance.
+	   We don't increment the source pointer each time.  Instead we
+	   use offsets and increment by 16 once per run of the loop.  But
+	   before probing for the matching byte we need some extra code
+	   to adjust the pointer again (see L(5) through L(7) below).  */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(7)		/* found it => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(6)		/* found it => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(5)		/* found it => return pointer */
+
+	/* Adjust both counters for a full round, i.e. 16 bytes.  */
+	addl $16, %eax
+	jmp L(1)
+	/* add missing source pointer increments */
+L(5):	addl $4, %eax
+L(6):	addl $4, %eax
+L(7):	addl $4, %eax
+
+	/* Test for the matching byte in the word.  %ecx contains a NUL
+	   char in the byte which originally was the byte we are looking
+	   at.  */
+L(8):	testb %cl, %cl		/* test first byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testb %ch, %ch		/* test second byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testl $0xff0000, %ecx	/* test third byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	/* No further test is needed; we know it is one of the four bytes.  */
+
+L(9):
+	popl %edi		/* pop saved register */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__rawmemchr)
+
+libc_hidden_def (__rawmemchr)
+weak_alias (__rawmemchr, rawmemchr)
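
Everything in the loop above rides on the 0xfefefeff carry trick from the long comment.  Below is a C model of the zero-byte test, written with a 64-bit sum so the carry out of bit 31 (which the assembly reads from the flags via jnc) stays visible; a sketch of the algorithm, not glibc code:

#include <stdint.h>

static int
has_zero_byte (uint32_t word)
{
  const uint32_t magic = 0xfefefeffu;	/* holes at bits 8, 16, 24 */
  uint64_t sum = (uint64_t) word + magic;

  if ((sum >> 32) == 0)			/* no carry out of bit 31 ... */
    return 1;				/* ... so the top byte is 0 */

  /* Keep only the hole bits that changed; if any one of them did not
     change, the increment below does not wrap to zero.  */
  return ((((uint32_t) sum ^ word) | magic) + 1) != 0;
}

rawmemchr applies this to WORD ^ (C * 0x01010101), which first turns every byte equal to C into a zero byte.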
diff --git a/REORG.TODO/sysdeps/i386/rshift.S b/REORG.TODO/sysdeps/i386/rshift.S
new file mode 100644
index 0000000000..cf179052b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/rshift.S
@@ -0,0 +1,105 @@
+/* i80386 __mpn_rshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+12		/* space for 3 saved regs */
+#define RES	PARMS
+#define S	RES+4
+#define SIZE	S+4
+#define CNT	SIZE+4
+
+	.text
+ENTRY (__mpn_rshift)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 8)
+	movl	S(%esp),%esi
+	cfi_rel_offset (esi, 4)
+	movl	SIZE(%esp),%edx
+	movl	CNT(%esp),%ecx
+	leal	-4(%edi,%edx,4),%edi
+	leal	(%esi,%edx,4),%esi
+	negl	%edx
+
+	movl	(%esi,%edx,4),%ebx	/* read least significant limb */
+	cfi_rel_offset (ebx, 0)
+	cfi_remember_state
+	xorl	%eax,%eax
+	shrdl	%cl,%ebx,%eax		/* compute carry limb */
+	incl	%edx
+	jz	L(end)
+	pushl	%eax			/* push carry limb onto stack */
+	cfi_adjust_cfa_offset (4)
+	testb	$1,%dl
+	jnz	L(1)			/* enter loop in the middle */
+	movl	%ebx,%eax
+
+	ALIGN (3)
+L(oop):	movl	(%esi,%edx,4),%ebx	/* load next higher limb */
+	shrdl	%cl,%ebx,%eax		/* compute result limb */
+	movl	%eax,(%edi,%edx,4)	/* store it */
+	incl	%edx
+L(1):	movl	(%esi,%edx,4),%eax
+	shrdl	%cl,%eax,%ebx
+	movl	%ebx,(%edi,%edx,4)
+	incl	%edx
+	jnz	L(oop)
+
+	shrl	%cl,%eax		/* compute most significant limb */
+	movl	%eax,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+	cfi_adjust_cfa_offset (-4)
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_restore_state
+L(end):	shrl	%cl,%ebx		/* compute most significant limb */
+	movl	%ebx,(%edi)		/* store it */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_rshift)
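
Functionally, __mpn_rshift follows the GMP convention: shift SIZE 32-bit limbs right by CNT bits (1 <= CNT < 32), store the result, and return the bits shifted out of the least significant limb, left-justified.  A reference sketch in C under those assumptions (the shrdl instructions in the unrolled loop compute exactly the two-limb merge below):

#include <stdint.h>

typedef uint32_t mp_limb_t;

static mp_limb_t
mpn_rshift_ref (mp_limb_t *res, const mp_limb_t *s, long size,
		unsigned int cnt)
{
  mp_limb_t carry = s[0] << (32 - cnt);	/* bits falling off the low end */
  for (long i = 0; i < size - 1; i++)
    res[i] = (s[i] >> cnt) | (s[i + 1] << (32 - cnt));
  res[size - 1] = s[size - 1] >> cnt;	/* most significant limb */
  return carry;
}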
diff --git a/REORG.TODO/sysdeps/i386/setfpucw.c b/REORG.TODO/sysdeps/i386/setfpucw.c
new file mode 100644
index 0000000000..40b995f18a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/setfpucw.c
@@ -0,0 +1,54 @@
+/* Set the FPU control word for x86.
+   Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <fpu_control.h>
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+void
+__setfpucw (fpu_control_t set)
+{
+  fpu_control_t cw;
+
+  /* Fetch the current control word.  */
+  __asm__ ("fnstcw %0" : "=m" (*&cw));
+
+  /* Preserve the reserved bits, and set the rest as the user
+     specified (or the default, if the user gave zero).  */
+  cw &= _FPU_RESERVED;
+  cw |= set & ~_FPU_RESERVED;
+
+  __asm__ ("fldcw %0" : : "m" (*&cw));
+
+  /* If the CPU supports SSE, we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+      xnew_exc &= ~((0xc00 << 3) | (FE_ALL_EXCEPT << 7));
+      xnew_exc |= ((set & 0xc00) << 3) | ((set & FE_ALL_EXCEPT) << 7);
+
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+}
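
The two shift counts in the MXCSR update encode the layout difference between the registers: the x87 control word keeps rounding control in bits 10-11 and the exception mask bits in its low six bits, while MXCSR keeps them at bits 13-14 and 7-12.  A small helper restating just those shifts (illustrative only, not an additional interface):

#include <fenv.h>

static unsigned int
cw_bits_to_mxcsr (unsigned int cw)
{
  return ((cw & 0xc00) << 3)		/* rounding: bits 10-11 -> 13-14 */
	 | ((cw & FE_ALL_EXCEPT) << 7);	/* masks: low bits -> 7-12 */
}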
diff --git a/REORG.TODO/sysdeps/i386/setjmp.S b/REORG.TODO/sysdeps/i386/setjmp.S
new file mode 100644
index 0000000000..738a899e8b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/setjmp.S
@@ -0,0 +1,58 @@
+/* setjmp for i386.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <asm-syntax.h>
+#include <stap-probe.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define JMPBUF	PARMS
+#define SIGMSK	JMPBUF+4
+
+ENTRY (__sigsetjmp)
+
+	movl JMPBUF(%esp), %eax
+
+	/* Save registers.  */
+	movl %ebx, (JB_BX*4)(%eax)
+	movl %esi, (JB_SI*4)(%eax)
+	movl %edi, (JB_DI*4)(%eax)
+	leal JMPBUF(%esp), %ecx	/* Save SP as it will be after we return.  */
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_SP*4)(%eax)
+	movl 0(%esp), %ecx	/* Save PC we are returning to now.  */
+	LIBC_PROBE (setjmp, 3, 4@%eax, -4@SIGMSK(%esp), 4@%ecx)
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_PC*4)(%eax)
+	movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer.  */
+
+#if IS_IN (rtld)
+	/* In ld.so we never save the signal mask.  */
+	xorl %eax, %eax
+	ret
+#else
+	/* Make a tail call to __sigjmp_save; it takes the same args.  */
+	jmp __sigjmp_save
+#endif
+END (__sigsetjmp)
+hidden_def (__sigsetjmp)
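
When PTR_MANGLE is defined, the saved SP and PC are obfuscated before landing in the jmp_buf so that a memory overwrite cannot plant a directly usable return address.  On i386 the mangling XORs the value with the per-thread pointer guard and rotates left by 9 bits; a hedged C model (the rotate count follows the i386 sysdep definition and should be treated as an assumption of this sketch):

#include <stdint.h>

static uint32_t
ptr_mangle_sketch (uint32_t value, uint32_t pointer_guard)
{
  uint32_t x = value ^ pointer_guard;	/* xorl %gs:POINTER_GUARD, reg */
  return (x << 9) | (x >> 23);		/* roll $9, reg */
}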
diff --git a/REORG.TODO/sysdeps/i386/stackguard-macros.h b/REORG.TODO/sysdeps/i386/stackguard-macros.h
new file mode 100644
index 0000000000..039762927c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stackguard-macros.h
@@ -0,0 +1,12 @@
+#include <stdint.h>
+
+#define STACK_CHK_GUARD \
+  ({ uintptr_t x; asm ("movl %%gs:0x14, %0" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+  ({							\
+     uintptr_t x;					\
+     asm ("movl %%gs:%c1, %0" : "=r" (x)		\
+	  : "i" (offsetof (tcbhead_t, pointer_guard)));	\
+     x;							\
+   })
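
The hard-coded %gs:0x14 above is the offset of stack_guard within the i386 TCB header, which %gs points at.  An illustrative layout sketch (an assumed subset of tcbhead_t with the i386 field order; the POINTER_CHK_GUARD macro computes its offset with offsetof instead of a literal):

#include <stdint.h>

struct tcbhead_sketch		/* illustrative subset of tcbhead_t */
{
  void *tcb;			/* %gs:0x00 */
  void *dtv;			/* %gs:0x04 */
  void *self;			/* %gs:0x08 */
  int multiple_threads;		/* %gs:0x0c */
  uintptr_t sysinfo;		/* %gs:0x10 */
  uintptr_t stack_guard;	/* %gs:0x14 -- read by STACK_CHK_GUARD */
  uintptr_t pointer_guard;	/* %gs:0x18 */
};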
diff --git a/REORG.TODO/sysdeps/i386/stackinfo.h b/REORG.TODO/sysdeps/i386/stackinfo.h
new file mode 100644
index 0000000000..ba17867d3a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stackinfo.h
@@ -0,0 +1,43 @@
+/* Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file contains a bit of information about the stack allocation
+   of the processor.  */
+
+#ifndef _STACKINFO_H
+#define _STACKINFO_H	1
+
+#include <elf.h>
+
+/* On x86 the stack grows down.  */
+#define _STACK_GROWS_DOWN	1
+
+/* Default to an executable stack.  PF_X can be overridden if PT_GNU_STACK is
+   present, but it is presumed absent.  */
+#define DEFAULT_STACK_PERMS (PF_R|PF_W|PF_X)
+
+/* Access to the stack pointer.  The macros are used in alloca_account
+   for which they need to act as barriers as well, hence the additional
+   (unnecessary) parameters.  */
+#define stackinfo_get_sp() \
+  ({ void *p__; asm volatile ("mov %%esp, %0" : "=r" (p__)); p__; })
+#define stackinfo_sub_sp(ptr) \
+  ({ ptrdiff_t d__;                                             \
+     asm volatile ("sub %%esp, %0" : "=r" (d__) : "0" (ptr));   \
+     d__; })
+
+#endif	/* stackinfo.h */
diff --git a/REORG.TODO/sysdeps/i386/start.S b/REORG.TODO/sysdeps/i386/start.S
new file mode 100644
index 0000000000..ccb1e2b38f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/start.S
@@ -0,0 +1,139 @@
+/* Startup code compliant to the ELF i386 ABI.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file with other
+   programs, and to distribute those programs without any restriction
+   coming from the use of this file. (The GNU Lesser General Public
+   License restrictions do apply in other respects; for example, they
+   cover modification of the file, and distribution when not linked
+   into another program.)
+
+   Note that people who make modified versions of this file are not
+   obligated to grant this special exception for their modified
+   versions; it is their choice whether to do so. The GNU Lesser
+   General Public License gives permission to release a modified
+   version without this exception; this exception also makes it
+   possible to release a modified version which carries forward this
+   exception.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is the canonical entry point, usually the first thing in the text
+   segment.  The SVR4/i386 ABI (pages 3-31, 3-32) says that when the entry
+   point runs, most registers' values are unspecified, except for:
+
+   %edx		Contains a function pointer to be registered with `atexit'.
+		This is how the dynamic linker arranges to have DT_FINI
+		functions called for shared libraries that have been loaded
+		before this code runs.
+
+   %esp		The stack contains the arguments and environment:
+		0(%esp)			argc
+		4(%esp)			argv[0]
+		...
+		(4*argc)(%esp)		NULL
+		(4*(argc+1))(%esp)	envp[0]
+		...
+					NULL
+*/
+
+	.text
+	.globl _start
+	.type _start,@function
+_start:
+	/* Clear the frame pointer.  The ABI suggests this be done, to
+	   mark the outermost frame clearly.  */
+	xorl %ebp, %ebp
+
+	/* Extract the arguments as encoded on the stack and set up
+	   the arguments for `main': argc, argv.  envp will be determined
+	   later in __libc_start_main.  */
+	popl %esi		/* Pop the argument count.  */
+	movl %esp, %ecx		/* argv starts just at the current stack top.*/
+
+	/* Before pushing the arguments, align the stack to a 16-byte
+	   boundary (SSE needs 16-byte alignment) to avoid penalties from
+	   misaligned accesses.  Thanks to Edward Seidl <seidl@janed.com>
+	   for pointing this out.  */
+	andl $0xfffffff0, %esp
+	pushl %eax		/* Push garbage because we allocate
+				   28 more bytes.  */
+
+	/* Provide the highest stack address to the user code (for stacks
+	   which grow downwards).  */
+	pushl %esp
+
+	pushl %edx		/* Push address of the shared library
+				   termination function.  */
+
+#ifdef SHARED
+	/* Load PIC register.  */
+	call 1f
+	addl $_GLOBAL_OFFSET_TABLE_, %ebx
+
+	/* Push address of our own entry points to .fini and .init.  */
+	leal __libc_csu_fini@GOTOFF(%ebx), %eax
+	pushl %eax
+	leal __libc_csu_init@GOTOFF(%ebx), %eax
+	pushl %eax
+
+	pushl %ecx		/* Push second argument: argv.  */
+	pushl %esi		/* Push first argument: argc.  */
+
+	pushl main@GOT(%ebx)
+
+	/* Call the user's main function, and exit with its value.
+	   But let libc call main.  */
+	call __libc_start_main@PLT
+#else
+	/* Push address of our own entry points to .fini and .init.  */
+	pushl $__libc_csu_fini
+	pushl $__libc_csu_init
+
+	pushl %ecx		/* Push second argument: argv.  */
+	pushl %esi		/* Push first argument: argc.  */
+
+	pushl $main
+
+	/* Call the user's main function, and exit with its value.
+	   But let libc call main.  */
+	call __libc_start_main
+#endif
+
+	hlt			/* Crash if somehow `exit' does return.  */
+
+#ifdef SHARED
+1:	movl	(%esp), %ebx
+	ret
+#endif
+
+/* To fulfill the System V/i386 ABI we need this symbol.  Yuck, it's so
+   meaningless since we don't support machines < 80386.  */
+	.section .rodata
+	.globl _fp_hw
+_fp_hw:	.long 3
+	.size _fp_hw, 4
+	.type _fp_hw,@object
+
+/* Define a symbol for the first piece of initialized data.  */
+	.data
+	.globl __data_start
+__data_start:
+	.long 0
+	.weak data_start
+	data_start = __data_start
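
The pushes above build, last argument first, the call to __libc_start_main.  As a sketch, the call made by _start matches a prototype along these lines (types simplified; csu/libc-start.c carries the authoritative declaration):

int __libc_start_main (int (*main) (int, char **, char **),
		       int argc, char **argv,
		       void (*init) (int, char **, char **), /* __libc_csu_init */
		       void (*fini) (void),		     /* __libc_csu_fini */
		       void (*rtld_fini) (void),	     /* DT_FINI hook, from %edx */
		       void *stack_end);		     /* highest stack address */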
diff --git a/REORG.TODO/sysdeps/i386/stpcpy.S b/REORG.TODO/sysdeps/i386/stpcpy.S
new file mode 100644
index 0000000000..d9981b677b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stpcpy.S
@@ -0,0 +1,88 @@
+/* Copy SRC to DEST returning the address of the terminating '\0' in DEST.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This function is defined in neither the ANSI C nor the POSIX standard,
+   but it is also not invented here.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+
+	.text
+ENTRY (__stpcpy)
+
+	movl DEST(%esp), %eax
+	movl SRC(%esp), %ecx
+	subl %eax, %ecx		/* magic: reduce number of loop variants
+				   to one using addressing mode */
+
+	/* Here we would like to write
+
+	subl $4, %eax
+	ALIGN (4)
+
+	but the assembler is too smart and picks the shortest form, in
+	which the immediate needs only one byte.  If we could force the
+	long form we would not need the alignment.  */
+
+	.byte 0x81, 0xe8	/* This is `subl $0x00000004, %eax' */
+	.long 0x00000004
+
+	/* Four times unfolded loop with only one loop counter.  This
+	   is achieved by the use of index+base addressing mode.  As the
+	   loop counter we use the destination address because this is
+	   also the result.  */
+L(1):	addl $4, %eax		/* increment loop counter */
+
+	movb (%eax,%ecx), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(2)			/* yes, then exit */
+
+	movb 1(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 1(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(3)			/* yes, then exit */
+
+	movb 2(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 2(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(4)			/* yes, then exit */
+
+	movb 3(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 3(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jnz L(1)		/* no, then continue loop */
+
+	incl %eax		/* correct loop counter */
+L(4):	incl %eax
+L(3):	incl %eax
+L(2):
+
+	ret
+END (__stpcpy)
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
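
The "magic" subtraction at the top is the heart of the routine: once SRC-DEST is computed, one induction variable addresses both strings through the base+index addressing mode.  The same idea in C, as a sketch (it relies on flat addressing exactly as the assembly does, so it is not strictly portable ISO C):

#include <stddef.h>

static char *
stpcpy_sketch (char *dest, const char *src)
{
  ptrdiff_t delta = src - dest;	/* one loop variable serves both strings */
  for (;; dest++)
    if ((*dest = dest[delta]) == '\0')
      return dest;		/* address of the terminating NUL */
}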
diff --git a/REORG.TODO/sysdeps/i386/stpncpy.S b/REORG.TODO/sysdeps/i386/stpncpy.S
new file mode 100644
index 0000000000..46f2aba713
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stpncpy.S
@@ -0,0 +1,147 @@
+/* Copy no more than N bytes from SRC to DEST, returning the address of
+   the terminating '\0' in DEST.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Some bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+     - original wrote n+1 chars in some cases.
+     - stpncpy() ought to behave like strncpy(), i.e. not null-terminate
+       if limited by n.  glibc-1.09 stpncpy() does this.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+	.text
+ENTRY (__stpncpy)
+
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	movl DEST(%esp), %eax
+	movl SRC(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl LEN(%esp), %ecx
+
+	subl %eax, %esi		/* magic: reduce number of loop variants
+				   to one using addressing mode */
+	jmp L(1)		/* jump to loop "head" */
+
+	ALIGN(4)
+
+	/* Four times unfolded loop with two loop counters.  We get the
+	   third value (the source address) by using the index+base
+	   addressing mode.  */
+L(2):	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(7)			/* yes, then exit */
+
+	movb 1(%eax,%esi), %dl	/* load current char */
+	movb %dl, 1(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(6)			/* yes, then exit */
+
+	movb 2(%eax,%esi), %dl	/* load current char */
+	movb %dl, 2(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(5)			/* yes, then exit */
+
+	movb 3(%eax,%esi), %dl	/* load current char */
+	movb %dl, 3(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(4)			/* yes, then exit */
+
+	addl $4, %eax		/* increment loop counter for full round */
+
+L(1):	subl $4, %ecx		/* still at least 4 bytes allowed? */
+	jae L(2)		/* yes, then go to start of loop */
+
+	/* The at most 3 remaining bytes are not processed in a loop.  */
+
+	addl $4, %ecx		/* correct above subtraction */
+	jz L(9)			/* character limit reached => go to end */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(3)			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	decl %ecx		/* decrement length counter */
+	jz L(9)			/* no more allowed => exit */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(3)			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	decl %ecx		/* decrement length counter */
+	jz L(9)			/* no more allowed => exit */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(3)			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	jmp L(9)		/* we don't have to test for counter underflow
+				   because we know we had at most 3 bytes
+				   remaining => exit */
+
+	/* When coming from the main loop we have to adjust the pointer.  */
+L(4):	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+
+L(5):	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+
+L(6):	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+L(7):
+
+	addl $3, %ecx		/* correct pre-decrementation of counter
+				   at the beginning of the loop; but why 3
+				   and not 4?  Very simple, we have to count
+				   the NUL char we already wrote.  */
+	jz L(9)			/* counter is also 0 => exit */
+
+	/* We now have to fill the rest of the buffer with NUL.  This
+	   is done in a tricky way.  Please note that the addressing mode
+	   used below is not the same as the one used above.  Here we use
+	   the %ecx register.  */
+L(8):
+	movb $0, (%ecx,%eax)	/* store NUL char */
+L(3):	decl %ecx		/* all bytes written? */
+	jnz L(8)		/* no, then again */
+
+L(9):	popl %esi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+
+	ret
+END (__stpncpy)
+
+libc_hidden_def (__stpncpy)
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/strcat.S b/REORG.TODO/sysdeps/i386/strcat.S
new file mode 100644
index 0000000000..4a26b3c528
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strcat.S
@@ -0,0 +1,265 @@
+/* strcat(dest, src) -- Append SRC on the end of DEST.
+   For Intel 80x86, x>=4.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
+   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+
+	.text
+ENTRY (strcat)
+
+	pushl %edi		/* Save callee-safe register.  */
+	cfi_adjust_cfa_offset (4)
+
+	movl DEST(%esp), %edx
+	movl SRC(%esp), %ecx
+
+	testb $0xff, (%ecx)	/* Is source string empty? */
+	jz L(8)			/* yes => return */
+
+	/* Test the first bytes separately until destination is aligned.  */
+	testl $3, %edx		/* destination pointer aligned? */
+	jz L(1)			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L(2)			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	testl $3, %edx		/* destination pointer aligned? */
+	jz L(1)			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L(2)			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	testl $3, %edx		/* destination pointer aligned? */
+	jz L(1)			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L(2)			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	/* Now we are aligned.  Begin scan loop.  */
+	jmp L(1)
+
+	cfi_rel_offset (edi, 0)
+	ALIGN(4)
+
+L(4):	addl $16,%edx		/* increment destination pointer for round */
+
+L(1):	movl (%edx), %eax	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+
+	/* If you compare this with the algorithm in memchr.S you will
+	/* If you compare this with the algorithm in memchr.S you will
+	   notice that an `xorl' statement is missing here.  But you must
+	   is a no-op.  */
+
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can test this condition now.  If no carry is
+	   signaled, no overflow can have occurred in the last byte =>
+	   it was 0.  */
+	jnc L(3)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L(3)
+
+	movl 4(%edx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is C => stop scanning */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(5)		/* one byte is NUL => stop scanning */
+
+	movl 8(%edx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is C => stop scanning */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(6)		/* one byte is NUL => stop scanning */
+
+	movl 12(%edx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is C => stop scanning */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(4)			/* no byte is NUL => keep scanning */
+
+L(7):	addl $4, %edx		/* adjust source pointer */
+L(6):	addl $4, %edx
+L(5):	addl $4, %edx
+
+L(3):	testb %al, %al		/* is first byte NUL? */
+	jz L(2)			/* yes => start copying */
+	incl %edx		/* increment source pointer */
+
+	testb %ah, %ah		/* is second byte NUL? */
+	jz L(2)			/* yes => start copying */
+	incl %edx		/* increment source pointer */
+
+	testl $0xff0000, %eax	/* is third byte NUL? */
+	jz L(2)			/* yes => start copying */
+	incl %edx		/* increment source pointer */
+
+L(2):	subl %ecx, %edx		/* reduce number of loop variants */
+
+	/* Now we have to align the source pointer.  */
+	testl $3, %ecx		/* pointer correctly aligned? */
+	jz L(29)		/* yes => start copy loop */
+	movb (%ecx), %al	/* get first byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL? */
+	jz L(8)			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	testl $3, %ecx		/* pointer correctly aligned? */
+	jz L(29)		/* yes => start copy loop */
+	movb (%ecx), %al	/* get first byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL? */
+	jz L(8)			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	testl $3, %ecx		/* pointer correctly aligned? */
+	jz L(29)		/* yes => start copy loop */
+	movb (%ecx), %al	/* get first byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL? */
+	jz L(8)			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	/* Now we are aligned.  */
+	jmp L(29)		/* start copy loop */
+
+	ALIGN(4)
+
+L(28):	movl %eax, 12(%ecx,%edx)/* store word at destination */
+	addl $16, %ecx		/* adjust pointer for full round */
+
+L(29):	movl (%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(9)		/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(9)		/* one byte is NUL => stop copying */
+	movl %eax, (%ecx,%edx)	/* store word to destination */
+
+	movl 4(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(91)		/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(91)		/* one byte is NUL => stop copying */
+	movl %eax, 4(%ecx,%edx)	/* store word to destination */
+
+	movl 8(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(92)		/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(92)		/* one byte is NUL => stop copying */
+	movl %eax, 8(%ecx,%edx)	/* store word to destination */
+
+	movl 12(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(93)		/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(28)		/* no byte is NUL => carry on copying */
+
+L(93):	addl $4, %ecx		/* adjust pointer */
+L(92):	addl $4, %ecx
+L(91):	addl $4, %ecx
+
+L(9):	movb %al, (%ecx,%edx)	/* store first byte of last word */
+	orb %al, %al		/* is it NUL? */
+	jz L(8)			/* yes => return */
+
+	movb %ah, 1(%ecx,%edx)	/* store second byte of last word */
+	orb %ah, %ah		/* is it NUL? */
+	jz L(8)			/* yes => return */
+
+	shrl $16, %eax		/* make upper bytes accessible */
+	movb %al, 2(%ecx,%edx)	/* store third byte of last word */
+	orb %al, %al		/* is it NUL? */
+	jz L(8)			/* yes => return */
+
+	movb %ah, 3(%ecx,%edx)	/* store fourth byte of last word */
+
+L(8):	movl DEST(%esp), %eax	/* start address of destination is result */
+	popl %edi		/* restore saved register */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (strcat)
+libc_hidden_builtin_def (strcat)
diff --git a/REORG.TODO/sysdeps/i386/strchr.S b/REORG.TODO/sysdeps/i386/strchr.S
new file mode 100644
index 0000000000..6075e77882
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strchr.S
@@ -0,0 +1,290 @@
+/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4		/* space for 1 saved reg */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (strchr)
+
+	pushl %edi		/* Save callee-safe registers used here.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+	movl STR(%esp), %eax
+	movl CHR(%esp), %edx
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16-bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* now it is c|c|0|0 */
+	movw %cx, %dx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  We do this for two
+	   reasons:
+	   1. aligned 32-bit memory access is faster,
+	   and (more important)
+	   2. the main loop processes 32 bits in one step although we
+	      don't know the end of the string.  Accessing memory at
+	      4-byte alignment guarantees that we never touch memory the
+	      trivial byte-wise implementation would not also touch
+	      (because all processor-inherent protection boundaries are
+	      multiples of 4).  */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	/* Now we have reached alignment.  */
+	jmp L(11)		/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN(4)
+
+L(1):	addl $16, %eax		/* adjust pointer for whole round */
+
+L(11):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can test this condition now.  If no carry is
+	   signaled, no overflow can have occurred in the last byte =>
+	   it was 0.  */
+	jnc L(7)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L(7)		/* found it => return pointer */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(2)		/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(2)		/* found NUL => return NULL */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L(71)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(71)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(2)		/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(2)		/* found NUL => return NULL */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L(72)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(72)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(2)		/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(2)		/* found NUL => return NULL */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L(73)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(73)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(2)		/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(1)			/* no NUL found => restart loop */
+
+L(2):	/* Return NULL.  */
+	xorl %eax, %eax
+	popl %edi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+L(73):	addl $4, %eax		/* adjust pointer */
+L(72):	addl $4, %eax
+L(71):	addl $4, %eax
+
+	/* We now scan for the byte in which the character was matched.
+	   But we have to take care of the case that a NUL char is
+	   found before this in the dword.  Note that we XORed %ecx
+	   with the byte we're looking for, therefore the tests below look
+	   reversed.  */
+
+L(7):	testb %cl, %cl		/* is first byte C? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %cl		/* is first byte NUL? */
+	je L(2)			/* yes => return NULL */
+	incl %eax		/* it's not in the first byte */
+
+	testb %ch, %ch		/* is second byte C? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %ch		/* is second byte NUL? */
+	je L(2)			/* yes => return NULL */
+	incl %eax		/* it's not in the second byte */
+
+	shrl $16, %ecx		/* make upper byte accessible */
+	testb %cl, %cl		/* is third byte C? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %cl		/* is third byte NUL? */
+	je L(2)			/* yes => return NULL */
+
+	/* It must be in the fourth byte and it cannot be NUL.  */
+	incl %eax
+
+L(6):
+	popl %edi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (strchr)
+
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
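
Compared with rawmemchr, each word in strchr's main loop is tested twice: once with the character XORed in (a zero byte then marks a match) and once unmodified (a zero byte marks the end of the string).  Reusing the has_zero_byte sketch shown after rawmemchr.S above, the per-word decision amounts to:

#include <stdint.h>

static int has_zero_byte (uint32_t word);	/* as sketched earlier */

/* Non-zero if the word needs the byte-wise tail at L(7)/L(2): it
   contains the replicated character CCCC or a terminating NUL.  */
static int
word_needs_byte_scan (uint32_t word, uint32_t cccc)
{
  return has_zero_byte (word ^ cccc)	/* some byte equals C */
	 || has_zero_byte (word);	/* or the string ends here */
}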
diff --git a/REORG.TODO/sysdeps/i386/strchrnul.S b/REORG.TODO/sysdeps/i386/strchrnul.S
new file mode 100644
index 0000000000..800b872c74
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strchrnul.S
@@ -0,0 +1,278 @@
+/* strchrnul (str, chr) -- Return pointer to first occurrence of CHR in STR
+   or the final NUL byte.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.org>
+   Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (__strchrnul)
+
+	pushl %edi		/* Save callee-safe registers used here.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+
+	movl STR(%esp), %eax
+	movl CHR(%esp), %edx
+
+	/* At the moment %edx contains CHR.  What we need for the
+	   algorithm is CHR in all bytes of the dword.  Avoid
+	   operations on 16-bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* now it is c|c|0|0 */
+	movw %cx, %dx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  We do this for two
+	   reasons:
+	   1. aligned 32-bit memory access is faster,
+	   and (more important)
+	   2. the main loop processes 32 bits in one step although we
+	      don't know the end of the string.  Accessing memory at
+	      4-byte alignment guarantees that we never touch memory the
+	      trivial byte-wise implementation would not also touch
+	      (because all processor-inherent protection boundaries are
+	      multiples of 4).  */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* increment pointer */
+
+	/* Now we have reached alignment.  */
+	jmp L(11)		/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for CHR, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is CHR.  This turns each byte that is CHR
+	 into a zero.  */
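+
+	/* The test described above, as a C sketch (illustration only,
+	   not from the original source; assumes a 32-bit unsigned int,
+	   with `sum < w' standing in for the x86 carry flag):
+
+	     int has_zero_byte (unsigned int w)
+	     {
+	       unsigned int sum = w + 0xfefefeffu;
+	       unsigned int holes = (sum ^ w) | 0xfefefeffu;
+	       return sum >= w || holes + 1 != 0;
+	     }
+
+	   `sum >= w' is the no-carry case (the `jnc' below); `holes + 1'
+	   tests the hole bits 8, 16 and 24 exactly as the xorl/orl/incl
+	   sequence below does.  */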
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN(4)
+
+L(1):	addl $16, %eax		/* adjust pointer for whole round */
+
+L(11):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* CHR */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.	*/
+	jnc L(7)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is CHR we don't get 0 in %edi.  */
+	jnz L(7)		/* found it => return pointer */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(7)		/* found NUL => return pointer */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* CHR */
+	jnc L(71)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(71)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(71)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(71)		/* found NUL => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* CHR */
+	jnc L(72)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(72)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(72)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(72)		/* found NUL => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* CHR */
+	jnc L(73)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(73)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(73)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(1)			/* no NUL found => restart loop */
+
+L(73):	addl $4, %eax		/* adjust pointer */
+L(72):	addl $4, %eax
+L(71):	addl $4, %eax
+
+	/* We now scan for the byte in which the character was matched.
+	   But we have to take care of the case that a NUL char is
+	   found before this in the dword.  */
+
+L(7):	testb %cl, %cl		/* is first byte CHR? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %cl		/* is first byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* it's not in the first byte */
+
+	testb %ch, %ch		/* is second byte CHR? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %ch		/* is second byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* it's not in the second byte */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	testb %cl, %cl		/* is third byte CHR? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %cl		/* is third byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+
+	/* The match (CHR or NUL) must be in the fourth byte.  */
+	incl %eax
+
+L(6):	popl %edi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__strchrnul)
+
+weak_alias (__strchrnul, strchrnul)
diff --git a/REORG.TODO/sysdeps/i386/strcspn.S b/REORG.TODO/sysdeps/i386/strcspn.S
new file mode 100644
index 0000000000..c852a3b1e5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strcspn.S
@@ -0,0 +1,240 @@
+/* strcspn (str, ss) -- Return the length of the initial segment of STR
+			which contains no characters from SS.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS
+#define STOP	STR+4
+
+	.text
+ENTRY (strcspn)
+
+	movl STR(%esp), %edx
+	movl STOP(%esp), %eax
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256-byte block filled with 0 */
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* These immediate values make label 2 */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* aligned on a 16 byte boundary, which */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* improves the performance of the loop.  */
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instructions only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%cl" to "testb %cl,%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L(2):	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L(2)		/* no => process next dword from stopset */
+
+L(1):	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the character is in the stopset we store its
+	   value in the table.  And since the value stored for NUL is NUL
+	   itself, the loop terminates for NUL in every case.  */
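+
+	/* The whole function as a C sketch (illustration only, not from
+	   the original source; `simple_strcspn' is a made-up name):
+
+	     size_t simple_strcspn (const char *str, const char *ss)
+	     {
+	       unsigned char table[256] = { 0 };
+	       for (; *ss != '\0'; ++ss)
+		 table[(unsigned char) *ss] = (unsigned char) *ss;
+	       const char *s = str;
+	       while (table[(unsigned char) *s] != (unsigned char) *s)
+		 ++s;
+	       return (size_t) (s - str);
+	     }
+
+	   table[0] stays 0, so the NUL terminator always satisfies the
+	   equality and stops the scan, just as described above.  */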
+
+L(3):	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(4)			/* yes => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(5)			/* yes => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(6)			/* yes => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	jne L(3)		/* no => start next loop round */
+
+	incl %eax		/* adjust pointer */
+L(6):	incl %eax
+L(5):	incl %eax
+
+L(4):	addl $256, %esp		/* remove stopset */
+	cfi_adjust_cfa_offset (-256)
+	subl %edx, %eax		/* we have to return the number of valid
+				   characters, so compute distance to first
+				   non-valid character */
+	ret
+END (strcspn)
+libc_hidden_builtin_def (strcspn)
diff --git a/REORG.TODO/sysdeps/i386/string-inlines.c b/REORG.TODO/sysdeps/i386/string-inlines.c
new file mode 100644
index 0000000000..d023bc3aa3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/string-inlines.c
@@ -0,0 +1,47 @@
+/* Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is to avoid PLT entries for the x86 version.  */
+#define __memcpy_g __memcpy_g_internal
+#define __strchr_g __strchr_g_internal
+#include <string/string-inlines.c>
+
+void *
+(__memcpy_c) (void *d, const void *s, size_t n)
+{
+  return memcpy (d, s, n);
+}
+
+void *
+__memset_cc (void *s, unsigned long int pattern, size_t n)
+{
+  return memset (s, pattern & 0xff, n);
+}
+strong_alias (__memset_cc, __memset_cg)
+
+void *
+__memset_gg (void *s, char c, size_t n)
+{
+  return memset (s, c, n);
+}
+
+#ifdef __memcpy_c
+# undef __memcpy_g
+strong_alias (__memcpy_g_internal, __memcpy_g)
+# undef __strchr_g
+strong_alias (__strchr_g_internal, __strchr_g)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/strlen.S b/REORG.TODO/sysdeps/i386/strlen.S
new file mode 100644
index 0000000000..192fadf20a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strlen.S
@@ -0,0 +1,132 @@
+/* strlen(str) -- determine the length of the string STR.
+   Optimized for Intel 80x86, x>=4.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS
+
+	.text
+ENTRY (strlen)
+
+	movl STR(%esp), %ecx
+	movl %ecx, %eax		/* duplicate it */
+
+	andl $3, %ecx		/* mask alignment bits */
+	jz L(1)			/* aligned => start loop */
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L(2)			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	xorl $3, %ecx		/* was alignment = 3? */
+	jz L(1)			/* yes => now it is aligned and start loop */
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L(2)			/* yes => return */
+	addl $1, %eax		/* increment pointer */
+
+	subl $1, %ecx		/* was alignment = 2? */
+	jz L(1)			/* yes => now it is aligned and start loop */
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L(2)			/* yes => return */
+
+/* Don't change the above `addl $1,%eax' and `subl $1, %ecx' into `incl %eax'
+   and `decl %ecx' resp.  The two additional bytes per instruction make
+   label 4 aligned on a 16 byte boundary without padding nops.
+
+   The following `subl $15, %eax' is part of this trick, too.  Together with
+   the next instruction (`addl $16, %eax') it is in fact an `incl %eax', just
+   as expected from the algorithm.  But doing so has the advantage that
+   no jump to label 1 is necessary and so the pipeline is not flushed.  */
+
+	subl $15, %eax		/* effectively +1 */
+
+
+L(4):	addl $16, %eax		/* adjust pointer for full loop */
+
+L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(3)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(3)		/* found NUL => return pointer */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(5)		/* found NUL => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(6)		/* found NUL => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(4)			/* no NUL found => continue loop */
+
+L(7):	addl $4, %eax		/* adjust pointer */
+L(6):	addl $4, %eax
+L(5):	addl $4, %eax
+
+L(3):	testb %cl, %cl		/* is first byte NUL? */
+	jz L(2)			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	testb %ch, %ch		/* is second byte NUL? */
+	jz L(2)			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	testl $0xff0000, %ecx	/* is third byte NUL? */
+	jz L(2)			/* yes => return pointer */
+	incl %eax		/* increment pointer */
+
+L(2):	subl STR(%esp), %eax	/* compute difference to string start */
+
+	ret
+END (strlen)
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/i386/strlen.c b/REORG.TODO/sysdeps/i386/strlen.c
new file mode 100644
index 0000000000..0b69957392
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strlen.c
@@ -0,0 +1,35 @@
+/* Determine the length of a string.  For Intel 80x86, x>=3.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+size_t
+strlen (const char *str)
+{
+  int cnt;
+
+  asm("cld\n"			/* Search forward.  */
+      /* Some old versions of gas need `repne' instead of `repnz'.  */
+      "repnz\n"			/* Look for a zero byte.  */
+      "scasb" /* %0, %1, %3 */ :
+      "=c" (cnt) : "D" (str), "0" (-1), "a" (0));
+
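+  /* %ecx starts at -1 and `repnz' decrements it once for every byte
+     examined, including the terminating NUL, so after the scan
+     cnt == -(length + 2).  E.g. for "ab": 3 bytes are scanned,
+     cnt == -4, and -2 - (-4) == 2.  */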
+  return -2 - cnt;
+}
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/i386/strpbrk.S b/REORG.TODO/sysdeps/i386/strpbrk.S
new file mode 100644
index 0000000000..1109b233da
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strpbrk.S
@@ -0,0 +1,243 @@
+/* strpbrk (str, ss) -- Return pointer to the first occurrence in STR of
+			any character from SS, or NULL if none is found.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define RTN	PARMS
+#define STR	RTN
+#define STOP	STR+4
+
+	.text
+ENTRY (strpbrk)
+
+	movl STR(%esp), %edx
+	movl STOP(%esp), %eax
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256-byte block filled with 0 */
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* These immediate values make label 2 */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* aligned on a 16 byte boundary, which */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* improves the performance of the loop.  */
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instructions only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%cl" to "testb %cl,%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L(2):	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L(2)		/* no => process next dword from stopset */
+
+L(1):	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the character is in the stopset we store its
+	   value in the table.  And since the value stored for NUL is NUL
+	   itself, the loop terminates for NUL in every case.  */
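+
+	/* The whole function as a C sketch (illustration only, not from
+	   the original source; `simple_strpbrk' is a made-up name):
+
+	     char *simple_strpbrk (const char *str, const char *ss)
+	     {
+	       unsigned char table[256] = { 0 };
+	       for (; *ss != '\0'; ++ss)
+		 table[(unsigned char) *ss] = (unsigned char) *ss;
+	       while (table[(unsigned char) *str] != (unsigned char) *str)
+		 ++str;
+	       return *str != '\0' ? (char *) str : NULL;
+	     }
+
+	   The final NUL test corresponds to the `orb %cl, %cl' at the
+	   end of this function.  */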
+
+L(3):	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(4)			/* yes => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(5)			/* yes => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(6)			/* yes => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	jne L(3)		/* no => start next loop round */
+
+	incl %eax		/* adjust pointer */
+L(6):	incl %eax
+L(5):	incl %eax
+
+L(4):	addl $256, %esp		/* remove stopset */
+	cfi_adjust_cfa_offset (-256)
+
+	orb %cl, %cl		/* was last character NUL? */
+	jnz L(7)		/* no => return pointer */
+	xorl %eax, %eax
+
+L(7):	ret
+END (strpbrk)
+libc_hidden_builtin_def (strpbrk)
diff --git a/REORG.TODO/sysdeps/i386/strrchr.S b/REORG.TODO/sysdeps/i386/strrchr.S
new file mode 100644
index 0000000000..95b304dc0b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strrchr.S
@@ -0,0 +1,334 @@
+/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+8	/* space for 2 saved regs */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (strrchr)
+
+	pushl %edi		/* Save callee-safe registers used here.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	xorl %eax, %eax
+	movl STR(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl CHR(%esp), %ecx
+
+	/* At the moment %ecx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16-bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %cl, %ch		/* now it is 0|0|c|c */
+	movl %ecx, %edx
+	shll $16, %ecx		/* now it is c|c|0|0 */
+	movw %dx, %cx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  There are two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bits in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4).  */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L(19)		/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L(11)		/* no match => skip */
+	movl %esi, %eax		/* remember pointer as possible result */
+L(11):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return */
+	incl %esi		/* increment pointer */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L(19)		/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L(12)		/* no match => skip */
+	movl %esi, %eax		/* remember pointer as result */
+L(12):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return */
+	incl %esi		/* increment pointer */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L(19)		/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L(13)		/* no match => skip */
+	movl %esi, %eax		/* remember pointer as result */
+L(13):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return */
+	incl %esi		/* increment pointer */
+
+	/* Now we have reached alignment.  */
+	jmp L(19)		/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	/* Jump to here when the character is detected.  We chose this
+	   arrangement because the character one is looking for is not
+	   as frequent as the rest and taking a conditional jump is more
+	   expensive than ignoring it.
+
+	   A few more words about the code below: it might not be obvious why
+	   we decrement the source pointer here.  In the loop the pointer
+	   is not pre-incremented and so it still points before the word
+	   we are looking at.  But you should take a look at the instruction
+	   which gets executed before we get into the loop: `addl $16, %esi'.
+	   This in effect turns the following `subl's into additions.  */
+
+	/* These fill bytes make the main loop be correctly aligned.
+	   We cannot use align because it is not the following instruction
+	   which should be aligned.  */
+	.byte 0, 0
+#ifndef	PROF
+	/* Profiling adds some code and so changes the alignment.  */
+	.byte 0
+#endif
+
+L(4):	subl $4, %esi		/* adjust pointer */
+L(41):	subl $4, %esi
+L(42):	subl $4, %esi
+L(43):	testl $0xff000000, %edx	/* is highest byte == C? */
+	jnz L(33)		/* no => try other bytes */
+	leal 15(%esi), %eax	/* store address as result */
+	jmp L(1)		/* and start loop again */
+
+L(3):	subl $4, %esi		/* adjust pointer */
+L(31):	subl $4, %esi
+L(32):	subl $4, %esi
+L(33):	testl $0xff0000, %edx	/* is C in third byte? */
+	jnz L(51)		/* no => try other bytes */
+	leal 14(%esi), %eax	/* store address as result */
+	jmp L(1)		/* and start loop again */
+
+L(51):
+	/* At this point we know that the byte is in one of the lower bytes.
+	   We make a guess and correct it if necessary.  This reduces the
+	   number of necessary jumps.  */
+	leal 12(%esi), %eax	/* guess address of lowest byte as result */
+	testb %dh, %dh		/* is guess correct? */
+	jnz L(1)		/* yes => start loop */
+	leal 13(%esi), %eax	/* correct guess to second byte */
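+
+	/* (Side note, not from the original source: storing the guess
+	   unconditionally and correcting it afterwards needs only the
+	   single `jnz' above; only the wrong-guess case pays for the
+	   extra `leal'.)  */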
+
+L(1):	addl $16, %esi		/* increment pointer for full round */
+
+L(19):	movl (%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.	*/
+
+	jnc L(20)			/* found NUL => check last word */
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %edx, %edi		/* (word+magic)^word */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L(20)			/* found NUL => check last word */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(4)		/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(3)		/* C is detected in the word => examine it */
+
+	movl 4(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(21)		/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(21)		/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(41)		/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(31)		/* C is detected in the word => examine it */
+
+	movl 8(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(22)		/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(22)		/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(42)		/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(32)		/* C is detected in the word => examine it */
+
+	movl 12(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(23)		/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(23)		/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(43)		/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(1)			/* C is not detected => restart loop */
+	jmp L(33)		/* examine word */
+
+L(23):	addl $4, %esi		/* adjust pointer */
+L(22):	addl $4, %esi
+L(21):	addl $4, %esi
+
+	/* What remains to do is to test which byte the NUL char is and
+	   whether the searched character appears in one of the bytes
+	   before.  A special case is that the searched byte may be NUL.
+	   In this case a pointer to the terminating NUL char has to be
+	   returned.  */
+
+L(20):	cmpb %cl, %dl		/* is first byte == C? */
+	jne L(24)		/* no => skip */
+	movl %esi, %eax		/* store address as result */
+L(24):	testb %dl, %dl		/* is first byte == NUL? */
+	jz L(2)			/* yes => return */
+
+	cmpb %cl, %dh		/* is second byte == C? */
+	jne L(25)		/* no => skip */
+	leal 1(%esi), %eax	/* store address as result */
+L(25):	testb %dh, %dh		/* is second byte == NUL? */
+	jz L(2)			/* yes => return */
+
+	shrl $16,%edx		/* make upper bytes accessible */
+	cmpb %cl, %dl		/* is third byte == C */
+	jne L(26)		/* no => skip */
+	leal 2(%esi), %eax	/* store address as result */
+L(26):	testb %dl, %dl		/* is third byte == NUL */
+	jz L(2)			/* yes => return */
+
+	cmpb %cl, %dh		/* is fourth byte == C */
+	jne L(2)		/* no => skip */
+	leal 3(%esi), %eax	/* store address as result */
+
+L(2):	popl %esi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (strrchr)
+
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/REORG.TODO/sysdeps/i386/strspn.S b/REORG.TODO/sysdeps/i386/strspn.S
new file mode 100644
index 0000000000..d433eb6af5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strspn.S
@@ -0,0 +1,240 @@
+/* strspn (str, ss) -- Return the length of the initial segment of STR
+			which contains only characters from SS.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS
+#define SKIP	STR+4
+
+	.text
+ENTRY (strspn)
+
+	movl STR(%esp), %edx
+	movl SKIP(%esp), %eax
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the skip characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256-byte block filled with 0 */
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* These immediate values make label 2 */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* aligned on a 16 byte boundary, which */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* improves the performance of the loop.  */
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instructions only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%cl" to "testb %cl,%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L(2):	movb (%eax), %cl	/* get byte from skipset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in skipset table */
+
+	movb 1(%eax), %cl	/* get byte from skipset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in skipset table */
+
+	movb 2(%eax), %cl	/* get byte from skipset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in skipset table */
+
+	movb 3(%eax), %cl	/* get byte from skipset */
+	addl $4, %eax		/* increment skipset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in skipset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L(2)		/* no => process next dword from skipset */
+
+L(1):	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character not in the skipset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that a character is in the skipset we store its
+	   value in the table.  Since the entry for NUL is never set, the
+	   `testb' below yields zero for NUL and terminates the loop.  */
+
+L(3):	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L(4)			/* no => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L(5)			/* no => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L(6)			/* no => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jnz L(3)		/* yes => start loop again */
+
+	incl %eax		/* adjust pointer */
+L(6):	incl %eax
+L(5):	incl %eax
+
+L(4):	addl $256, %esp		/* remove skipset */
+	cfi_adjust_cfa_offset (-256)
+	subl %edx, %eax		/* we have to return the number of valid
+				   characters, so compute distance to first
+				   non-valid character */
+	ret
+END (strspn)
+libc_hidden_builtin_def (strspn)
diff --git a/REORG.TODO/sysdeps/i386/sub_n.S b/REORG.TODO/sysdeps/i386/sub_n.S
new file mode 100644
index 0000000000..3649da29e7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sub_n.S
@@ -0,0 +1,111 @@
+/* i80386 __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+   and store difference in a third limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+8		/* space for 2 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+ENTRY (__mpn_sub_n)
+
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 4)
+	movl	S1(%esp),%esi
+	cfi_rel_offset (esi, 0)
+	movl	S2(%esp),%edx
+	movl	SIZE(%esp),%ecx
+	movl	%ecx,%eax
+	shrl	$3,%ecx			/* compute count for unrolled loop */
+	negl	%eax
+	andl	$7,%eax			/* get index where to start loop */
+	jz	L(oop)			/* necessary special case for 0 */
+	incl	%ecx			/* adjust loop count */
+	shll	$2,%eax			/* adjustment for pointers... */
+	subl	%eax,%edi		/* ... since they are offset ... */
+	subl	%eax,%esi		/* ... by a constant when we ... */
+	subl	%eax,%edx		/* ... enter the loop */
+	shrl	$2,%eax			/* restore previous value */
+#ifdef PIC
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, L(oop)-L(0)-3 cannot be put into the leal */
+	call	L(0)
+	cfi_adjust_cfa_offset (4)
+L(0):	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$(L(oop)-L(0)-3),%eax
+	addl	$4,%esp
+	cfi_adjust_cfa_offset (-4)
+#else
+/* Calculate start address in loop for non-PIC.  */
+	leal	(L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+	jmp	*%eax			/* jump into loop */
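+/* (Side note, not from the original source: the computed `jmp *%eax'
+   enters the unrolled loop mid-body, in the manner of Duff's device.
+   The first movl/sbbl/movl group below encodes to 6 bytes (no
+   displacement) and each later group to 9 bytes (one-byte
+   displacements), so group K starts at L(oop) + 9*K - 3; the
+   `leal (%eax,%eax,8)' computes the 9*K and the -3 appears in the
+   address arithmetic above.  The pointers were biased earlier so that
+   entering mid-loop processes exactly SIZE limbs.)  */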
+	ALIGN (3)
+L(oop):	movl	(%esi),%eax
+	sbbl	(%edx),%eax
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	sbbl	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	sbbl	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	sbbl	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	sbbl	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	sbbl	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	sbbl	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	sbbl	28(%edx),%eax
+	movl	%eax,28(%edi)
+	leal	32(%edi),%edi
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx
+	jnz	L(oop)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_sub_n)
diff --git a/REORG.TODO/sysdeps/i386/submul_1.S b/REORG.TODO/sysdeps/i386/submul_1.S
new file mode 100644
index 0000000000..c765e8dd79
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/submul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+   the result from a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define sizeP ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_submul_1)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 12)
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebp, 4)
+	cfi_rel_offset (ebx, 0)
+
+	movl	RES(%esp), %res_ptr
+	movl	S1(%esp), %s1_ptr
+	movl	SIZE(%esp), %sizeP
+	movl	S2LIMB(%esp), %s2_limb
+	leal	(%res_ptr,%sizeP,4), %res_ptr
+	leal	(%s1_ptr,%sizeP,4), %s1_ptr
+	negl	%sizeP
+	xorl	%ebp, %ebp
+	ALIGN (3)
+L(oop):
+	movl	(%s1_ptr,%sizeP,4), %eax
+	mull	%s2_limb
+	addl	%ebp, %eax
+	adcl	$0, %edx
+	subl	%eax, (%res_ptr,%sizeP,4)
+	adcl	$0, %edx
+	movl	%edx, %ebp
+
+	incl	%sizeP
+	jnz	L(oop)
+	movl	%ebp, %eax
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_submul_1)
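
The loop body above computes a 32x32->64 multiply (mull leaves the product in %edx:%eax), folds in the running carry, subtracts the low word from the destination limb, and lets the borrow bump the high word, which becomes the next carry. A minimal C sketch of those semantics, again with an illustrative 32-bit limb type rather than the real GMP headers:

    #include <stddef.h>

    typedef unsigned int mp_limb_t;	/* 32-bit limb, as on i386 */

    /* Subtract s1[0..size-1] * s2_limb from res[0..size-1]; the return
       value is the high limb left over, as in the asm above.  */
    static mp_limb_t
    ref_mpn_submul_1 (mp_limb_t *res, const mp_limb_t *s1,
                      size_t size, mp_limb_t s2_limb)
    {
      mp_limb_t carry = 0;
      for (size_t i = 0; i < size; i++)
        {
          unsigned long long prod
            = (unsigned long long) s1[i] * s2_limb + carry;
          mp_limb_t low = (mp_limb_t) prod;
          mp_limb_t high = (mp_limb_t) (prod >> 32);
          high += res[i] < low;	/* borrow from the subtraction */
          res[i] -= low;
          carry = high;
        }
      return carry;
    }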
diff --git a/REORG.TODO/sysdeps/i386/symbol-hacks.h b/REORG.TODO/sysdeps/i386/symbol-hacks.h
new file mode 100644
index 0000000000..36a13c83f7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/symbol-hacks.h
@@ -0,0 +1,21 @@
+/* Hacks needed for symbol manipulation.  i386 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/wordsize-32/divdi3-symbol-hacks.h>
+
+#include_next "symbol-hacks.h"
diff --git a/REORG.TODO/sysdeps/i386/sys/ucontext.h b/REORG.TODO/sysdeps/i386/sys/ucontext.h
new file mode 100644
index 0000000000..fb5df11965
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sys/ucontext.h
@@ -0,0 +1,139 @@
+/* Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* System V/i386 ABI compliant context switching support.  */
+
+#ifndef _SYS_UCONTEXT_H
+#define _SYS_UCONTEXT_H	1
+
+#include <features.h>
+
+#include <bits/types/sigset_t.h>
+#include <bits/sigcontext.h>
+#include <bits/types/stack_t.h>
+
+
+/* Type for general register.  */
+typedef int greg_t;
+
+/* Number of general registers.  */
+#define __NGREG	19
+#ifdef __USE_MISC
+# define NGREG	__NGREG
+#endif
+
+/* Container for all general registers.  */
+typedef greg_t gregset_t[__NGREG];
+
+#ifdef __USE_MISC
+/* Number of each register in the `gregset_t' array.  */
+enum
+{
+  REG_GS = 0,
+# define REG_GS	REG_GS
+  REG_FS,
+# define REG_FS	REG_FS
+  REG_ES,
+# define REG_ES	REG_ES
+  REG_DS,
+# define REG_DS	REG_DS
+  REG_EDI,
+# define REG_EDI	REG_EDI
+  REG_ESI,
+# define REG_ESI	REG_ESI
+  REG_EBP,
+# define REG_EBP	REG_EBP
+  REG_ESP,
+# define REG_ESP	REG_ESP
+  REG_EBX,
+# define REG_EBX	REG_EBX
+  REG_EDX,
+# define REG_EDX	REG_EDX
+  REG_ECX,
+# define REG_ECX	REG_ECX
+  REG_EAX,
+# define REG_EAX	REG_EAX
+  REG_TRAPNO,
+# define REG_TRAPNO	REG_TRAPNO
+  REG_ERR,
+# define REG_ERR	REG_ERR
+  REG_EIP,
+# define REG_EIP	REG_EIP
+  REG_CS,
+# define REG_CS	REG_CS
+  REG_EFL,
+# define REG_EFL	REG_EFL
+  REG_UESP,
+# define REG_UESP	REG_UESP
+  REG_SS
+# define REG_SS	REG_SS
+};
+#endif
+
+#ifdef __USE_MISC
+# define __ctx(fld) fld
+# define __ctxt(tag) tag
+#else
+# define __ctx(fld) __ ## fld
+# define __ctxt(tag) /* Empty.  */
+#endif
+
+/* Structure to describe FPU registers.  */
+typedef struct fpregset
+  {
+    union
+      {
+	struct __ctxt(fpchip_state)
+	  {
+	    int __ctx(state)[27];
+	    int __ctx(status);
+	  } __ctx(fpchip_state);
+
+	struct __ctxt(fp_emul_space)
+	  {
+	    char __ctx(fp_emul)[246];
+	    char __ctx(fp_epad)[2];
+	  } __ctx(fp_emul_space);
+
+	int __ctx(f_fpregs)[62];
+      } __ctx(fp_reg_set);
+
+    long int __ctx(f_wregs)[33];
+  } fpregset_t;
+
+/* Context to describe whole processor state.  */
+typedef struct
+  {
+    gregset_t __ctx(gregs);
+    fpregset_t __ctx(fpregs);
+  } mcontext_t;
+
+#undef __ctx
+#undef __ctxt
+
+/* Userlevel context.  */
+typedef struct ucontext
+  {
+    unsigned long int uc_flags;
+    struct ucontext *uc_link;
+    sigset_t uc_sigmask;
+    stack_t uc_stack;
+    mcontext_t uc_mcontext;
+    long int uc_filler[5];
+  } ucontext_t;
+
+#endif /* sys/ucontext.h */
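
A typical consumer of this layout is an SA_SIGINFO signal handler that casts its third argument to ucontext_t and indexes gregs with the REG_* constants (which require _GNU_SOURCE, since they sit under __USE_MISC). A sketch, assuming the Linux i386 variant of this header, whose gregs array is laid out compatibly:

    #define _GNU_SOURCE
    #include <signal.h>
    #include <string.h>
    #include <ucontext.h>
    #include <unistd.h>

    static void
    segv_handler (int sig, siginfo_t *info, void *ctx)
    {
      ucontext_t *uc = ctx;
      /* The faulting instruction pointer; only async-signal-safe
         calls may follow in a real handler.  */
      greg_t eip = uc->uc_mcontext.gregs[REG_EIP];
      (void) sig; (void) info; (void) eip;
      _exit (1);
    }

    int
    main (void)
    {
      struct sigaction sa;
      memset (&sa, 0, sizeof sa);
      sa.sa_sigaction = segv_handler;
      sa.sa_flags = SA_SIGINFO;
      sigaction (SIGSEGV, &sa, NULL);
      *(volatile int *) 0 = 0;	/* fault on purpose */
      return 0;
    }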
diff --git a/REORG.TODO/sysdeps/i386/sysdep.h b/REORG.TODO/sysdeps/i386/sysdep.h
new file mode 100644
index 0000000000..d2b0860b99
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sysdep.h
@@ -0,0 +1,159 @@
+/* Assembler macros for i386.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/sysdep.h>
+
+#include <features.h> /* For __GNUC_PREREQ.  */
+
+/* It is desirable that the names of PIC thunks match those used by
+   GCC so that multiple copies are eliminated by the linker.  Because
+   GCC 4.6 and earlier use __i686 in the names, it is necessary to
+   override that predefined macro.  */
+#if defined __i686 && defined __ASSEMBLER__
+#undef __i686
+#define __i686 __i686
+#endif
+
+#ifdef	__ASSEMBLER__
+# define GET_PC_THUNK(reg) __x86.get_pc_thunk.reg
+#else
+# define GET_PC_THUNK_STR(reg) "__x86.get_pc_thunk." #reg
+#endif
+
+#ifdef	__ASSEMBLER__
+
+/* Syntactic details of assembler.  */
+
+/* ELF's .align takes a byte count; most other formats take the log2 of it.  */
+#define ALIGNARG(log2) 1<<log2
+#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
+
+
+/* Define an entry point visible from C.
+
+   There is currently a bug in gdb which prevents us from specifying
+   incomplete stabs information.  Fake some entries here which specify
+   the current source file.  */
+#define	ENTRY(name)							      \
+  .globl C_SYMBOL_NAME(name);						      \
+  .type C_SYMBOL_NAME(name),@function;					      \
+  .align ALIGNARG(4);							      \
+  C_LABEL(name)								      \
+  cfi_startproc;							      \
+  CALL_MCOUNT
+
+#undef	END
+#define END(name)							      \
+  cfi_endproc;								      \
+  ASM_SIZE_DIRECTIVE(name)
+
+#define ENTRY_CHK(name) ENTRY (name)
+#define END_CHK(name) END (name)
+
+/* If compiled for profiling, call `mcount' at the start of each function.  */
+#ifdef	PROF
+/* The mcount code relies on a normal frame pointer being on the stack
+   to locate our caller, so push one just for its benefit.  */
+#define CALL_MCOUNT \
+  pushl %ebp; cfi_adjust_cfa_offset (4); movl %esp, %ebp; \
+  cfi_def_cfa_register (ebp); call JUMPTARGET(mcount); \
+  popl %ebp; cfi_def_cfa (esp, 4);
+#else
+#define CALL_MCOUNT		/* Do nothing.  */
+#endif
+
+/* Since C identifiers are not normally prefixed with an underscore
+   on this system, the asm identifier `syscall_error' intrudes on the
+   C name space.  Make sure we use an innocuous name.  */
+#define	syscall_error	__syscall_error
+#define mcount		_mcount
+
+#define	PSEUDO(name, syscall_name, args)				      \
+  .globl syscall_error;							      \
+lose: SYSCALL_PIC_SETUP							      \
+  jmp JUMPTARGET(syscall_error);					      \
+  ENTRY (name)								      \
+  DO_CALL (syscall_name, args);						      \
+  jb lose
+
+#undef	PSEUDO_END
+#define	PSEUDO_END(name)						      \
+  END (name)
+
+# define SETUP_PIC_REG(reg) \
+  .ifndef GET_PC_THUNK(reg);						      \
+  .section .gnu.linkonce.t.GET_PC_THUNK(reg),"ax",@progbits;		      \
+  .globl GET_PC_THUNK(reg);						      \
+  .hidden GET_PC_THUNK(reg);						      \
+  .p2align 4;								      \
+  .type GET_PC_THUNK(reg),@function;					      \
+GET_PC_THUNK(reg):							      \
+  movl (%esp), %e##reg;							      \
+  ret;									      \
+  .size GET_PC_THUNK(reg), . - GET_PC_THUNK(reg);			      \
+  .previous;								      \
+  .endif;								      \
+  call GET_PC_THUNK(reg)
+
+# define LOAD_PIC_REG(reg) \
+  SETUP_PIC_REG(reg); addl $_GLOBAL_OFFSET_TABLE_, %e##reg
+
+#undef JUMPTARGET
+#ifdef PIC
+#define JUMPTARGET(name)	name##@PLT
+#define SYSCALL_PIC_SETUP \
+    pushl %ebx;								      \
+    cfi_adjust_cfa_offset (4);						      \
+    call 0f;								      \
+0:  popl %ebx;								      \
+    cfi_adjust_cfa_offset (-4);						      \
+    addl $_GLOBAL_OFFSET_TABLE_+[.-0b], %ebx;
+
+#else
+#define JUMPTARGET(name)	name
+#define SYSCALL_PIC_SETUP	/* Nothing.  */
+#endif
+
+/* Local label name for asm code.  */
+#ifndef L
+#define L(name)		.L##name
+#endif
+
+#define atom_text_section .section ".text.atom", "ax"
+
+#else /* __ASSEMBLER__ */
+
+# define SETUP_PIC_REG_STR(reg)						\
+  ".ifndef " GET_PC_THUNK_STR (reg) "\n"				\
+  ".section .gnu.linkonce.t." GET_PC_THUNK_STR (reg) ",\"ax\",@progbits\n" \
+  ".globl " GET_PC_THUNK_STR (reg) "\n"					\
+  ".hidden " GET_PC_THUNK_STR (reg) "\n"				\
+  ".p2align 4\n"							\
+  ".type " GET_PC_THUNK_STR (reg) ",@function\n"			\
+GET_PC_THUNK_STR (reg) ":"						\
+  "movl (%%esp), %%e" #reg "\n"						\
+  "ret\n"								\
+  ".size " GET_PC_THUNK_STR (reg) ", . - " GET_PC_THUNK_STR (reg) "\n"	\
+  ".previous\n"								\
+  ".endif\n"								\
+  "call " GET_PC_THUNK_STR (reg)
+
+# define LOAD_PIC_REG_STR(reg) \
+  SETUP_PIC_REG_STR (reg) "\naddl $_GLOBAL_OFFSET_TABLE_, %%e" #reg
+
+#endif	/* __ASSEMBLER__ */
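
The _STR variants exist so C code inside glibc can splice the PC thunk into inline assembly; the .ifndef/.gnu.linkonce wrapping makes every emitter of the thunk produce an identical, linker-mergeable copy. A hypothetical sketch of such a splice (the global variable name is made up, and this only makes sense when compiling glibc itself with PIC defined and sysdep.h included):

    /* Load the GOT address into %edx via the thunk, then pull a
       global's address out of the GOT without a text relocation.  */
    extern int some_global;	/* hypothetical */

    static int *
    addr_of_some_global (void)
    {
      int *p;
      asm (LOAD_PIC_REG_STR (dx) "\n\t"
           "movl some_global@GOT(%%edx), %0"
           : "=r" (p) : : "edx");
      return p;
    }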
diff --git a/REORG.TODO/sysdeps/i386/tls-macros.h b/REORG.TODO/sysdeps/i386/tls-macros.h
new file mode 100644
index 0000000000..053cba05d1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tls-macros.h
@@ -0,0 +1,78 @@
+#include <features.h> /* For __GNUC_PREREQ.  */
+
+#define TLS_LE(x) \
+  ({ int *__l;								      \
+     asm ("movl %%gs:0,%0\n\t"						      \
+	  "subl $" #x "@tpoff,%0"					      \
+	  : "=r" (__l));						      \
+     __l; })
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_IE(x) \
+  ({ int *__l;								      \
+     asm ("movl %%gs:0,%0\n\t"						      \
+	  "subl " #x "@gottpoff(%%ebx),%0"				      \
+	  : "=r" (__l));						      \
+     __l; })
+#else
+# define TLS_IE(x) \
+  ({ int *__l, __b;							      \
+     asm ("call 1f\n\t"							      \
+	  ".subsection 1\n"						      \
+	  "1:\tmovl (%%esp), %%ebx\n\t"					      \
+	  "ret\n\t"							      \
+	  ".previous\n\t"						      \
+	  "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t"			      \
+	  "movl %%gs:0,%0\n\t"						      \
+	  "subl " #x "@gottpoff(%%ebx),%0"				      \
+	  : "=r" (__l), "=&b" (__b));					      \
+     __l; })
+#endif
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_LD(x) \
+  ({ int *__l, __c, __d;						      \
+     asm ("leal " #x "@tlsldm(%%ebx),%%eax\n\t"				      \
+	  "call ___tls_get_addr@plt\n\t"				      \
+	  "leal " #x "@dtpoff(%%eax), %%eax"				      \
+	  : "=a" (__l), "=&c" (__c), "=&d" (__d));			      \
+     __l; })
+#else
+# define TLS_LD(x) \
+  ({ int *__l, __b, __c, __d;						      \
+     asm ("call 1f\n\t"							      \
+	  ".subsection 1\n"						      \
+	  "1:\tmovl (%%esp), %%ebx\n\t"					      \
+	  "ret\n\t"							      \
+	  ".previous\n\t"						      \
+	  "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t"			      \
+	  "leal " #x "@tlsldm(%%ebx),%%eax\n\t"				      \
+	  "call ___tls_get_addr@plt\n\t"				      \
+	  "leal " #x "@dtpoff(%%eax), %%eax"				      \
+	  : "=a" (__l), "=&b" (__b), "=&c" (__c), "=&d" (__d));		      \
+     __l; })
+#endif
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_GD(x) \
+  ({ int *__l, __c, __d;						      \
+     asm ("leal " #x "@tlsgd(%%ebx),%%eax\n\t"				      \
+	  "call ___tls_get_addr@plt\n\t"				      \
+	  "nop"								      \
+	  : "=a" (__l), "=&c" (__c), "=&d" (__d));			      \
+     __l; })
+#else
+# define TLS_GD(x) \
+  ({ int *__l, __b, __c, __d;						      \
+     asm ("call 1f\n\t"							      \
+	  ".subsection 1\n"						      \
+	  "1:\tmovl (%%esp), %%ebx\n\t"					      \
+	  "ret\n\t"							      \
+	  ".previous\n\t"						      \
+	  "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t"			      \
+	  "leal " #x "@tlsgd(%%ebx),%%eax\n\t"				      \
+	  "call ___tls_get_addr@plt\n\t"				      \
+	  "nop"								      \
+	  : "=a" (__l), "=&b" (__b), "=&c" (__c), "=&d" (__d));		      \
+     __l; })
+#endif
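
Each macro materializes the address of a __thread variable through one of the four TLS access models (local-exec, initial-exec, local-dynamic, general-dynamic), so tests can check that every model's relocations resolve correctly. A minimal sketch of a consumer, assuming this header is reachable on the include path:

    #include <tls-macros.h>

    __thread int counter;

    int
    read_counter (void)
    {
      /* Local-exec access; TLS_IE, TLS_LD and TLS_GD are used the same
         way but route through the GOT or ___tls_get_addr.  */
      int *p = TLS_LE (counter);
      return *p;
    }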
diff --git a/REORG.TODO/sysdeps/i386/tlsdesc.c b/REORG.TODO/sysdeps/i386/tlsdesc.c
new file mode 100644
index 0000000000..90de2bb05e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tlsdesc.c
@@ -0,0 +1,268 @@
+/* Manage TLS descriptors.  i386 version.
+   Copyright (C) 2005-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <link.h>
+#include <ldsodefs.h>
+#include <elf/dynamic-link.h>
+#include <tls.h>
+#include <dl-tlsdesc.h>
+#include <dl-unmap-segments.h>
+#include <tlsdeschtab.h>
+
+/* The following four functions take an entry_check_offset argument.
+   The caller computes it as the offset between its entry point and
+   the call site, so that subtracting this offset from the return
+   address implicitly passed to the function recovers the caller's
+   entry point, which is then compared with the entry point recorded
+   in the TLS descriptor.  If the two no longer match, another thread
+   has already resolved the descriptor, and we return immediately.  */
+
+/* This function is used to lazily resolve TLS_DESC REL relocations
+   that reference the *ABS* segment in their own link maps.  The
+   argument is the addend originally stored there.  */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_abs_plus_addend_fixup (struct tlsdesc volatile *td,
+					   struct link_map *l,
+					   ptrdiff_t entry_check_offset)
+{
+  ptrdiff_t addend = (ptrdiff_t) td->arg;
+
+  if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+					  - entry_check_offset))
+    return;
+
+#ifndef SHARED
+  CHECK_STATIC_TLS (l, l);
+#else
+  if (!TRY_STATIC_TLS (l, l))
+    {
+      td->arg = _dl_make_tlsdesc_dynamic (l, addend);
+      td->entry = _dl_tlsdesc_dynamic;
+    }
+  else
+#endif
+    {
+      td->arg = (void*) (addend - l->l_tls_offset);
+      td->entry = _dl_tlsdesc_return;
+    }
+
+  _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to lazily resolve TLS_DESC REL relocations
+   that originally had zero addends.  The argument location, that
+   originally held the addend, is used to hold a pointer to the
+   relocation, but it has to be restored before we call the function
+   that applies relocations.  */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_rel_fixup (struct tlsdesc volatile *td,
+			       struct link_map *l,
+			       ptrdiff_t entry_check_offset)
+{
+  const ElfW(Rel) *reloc = td->arg;
+
+  if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+					  - entry_check_offset))
+    return;
+
+  /* The code below was borrowed from _dl_fixup(),
+     except for checking for STB_LOCAL.  */
+  const ElfW(Sym) *const symtab
+    = (const void *) D_PTR (l, l_info[DT_SYMTAB]);
+  const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
+  const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
+  lookup_t result;
+
+  /* Look up the target symbol.  If the normal lookup rules are not
+     used, don't look in the global scope.  */
+  if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL
+      && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0)
+    {
+      const struct r_found_version *version = NULL;
+
+      if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+	{
+	  const ElfW(Half) *vernum =
+	    (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]);
+	  ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff;
+	  version = &l->l_versions[ndx];
+	  if (version->hash == 0)
+	    version = NULL;
+	}
+
+      result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym,
+				    l->l_scope, version, ELF_RTYPE_CLASS_PLT,
+				    DL_LOOKUP_ADD_DEPENDENCY, NULL);
+    }
+  else
+    {
+      /* We already found the symbol.  The module (and therefore its load
+	 address) is also known.  */
+      result = l;
+    }
+
+  if (!sym)
+    {
+      td->arg = 0;
+      td->entry = _dl_tlsdesc_undefweak;
+    }
+  else
+    {
+#  ifndef SHARED
+      CHECK_STATIC_TLS (l, result);
+#  else
+      if (!TRY_STATIC_TLS (l, result))
+	{
+	  td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value);
+	  td->entry = _dl_tlsdesc_dynamic;
+	}
+      else
+#  endif
+	{
+	  td->arg = (void*)(sym->st_value - result->l_tls_offset);
+	  td->entry = _dl_tlsdesc_return;
+	}
+    }
+
+  _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to lazily resolve TLS_DESC RELA relocations.
+   The argument location is used to hold a pointer to the relocation.  */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
+				struct link_map *l,
+				ptrdiff_t entry_check_offset)
+{
+  const ElfW(Rela) *reloc = td->arg;
+
+  if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+					  - entry_check_offset))
+    return;
+
+  /* The code below was borrowed from _dl_fixup(),
+     except for checking for STB_LOCAL.  */
+  const ElfW(Sym) *const symtab
+    = (const void *) D_PTR (l, l_info[DT_SYMTAB]);
+  const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
+  const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
+  lookup_t result;
+
+  /* Look up the target symbol.  If the normal lookup rules are not
+     used, don't look in the global scope.  */
+  if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL
+      && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0)
+    {
+      const struct r_found_version *version = NULL;
+
+      if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+	{
+	  const ElfW(Half) *vernum =
+	    (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]);
+	  ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff;
+	  version = &l->l_versions[ndx];
+	  if (version->hash == 0)
+	    version = NULL;
+	}
+
+      result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym,
+				    l->l_scope, version, ELF_RTYPE_CLASS_PLT,
+				    DL_LOOKUP_ADD_DEPENDENCY, NULL);
+    }
+  else
+    {
+      /* We already found the symbol.  The module (and therefore its load
+	 address) is also known.  */
+      result = l;
+    }
+
+  if (!sym)
+    {
+      td->arg = (void*) reloc->r_addend;
+      td->entry = _dl_tlsdesc_undefweak;
+    }
+  else
+    {
+#  ifndef SHARED
+      CHECK_STATIC_TLS (l, result);
+#  else
+      if (!TRY_STATIC_TLS (l, result))
+	{
+	  td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value
+					      + reloc->r_addend);
+	  td->entry = _dl_tlsdesc_dynamic;
+	}
+      else
+#  endif
+	{
+	  td->arg = (void*) (sym->st_value - result->l_tls_offset
+			     + reloc->r_addend);
+	  td->entry = _dl_tlsdesc_return;
+	}
+    }
+
+  _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to avoid busy waiting for other threads to
+   complete the lazy relocation.  Once another thread wins the race to
+   relocate a TLS descriptor, it sets the descriptor up such that this
+   function is called to wait until the resolver releases the
+   lock.  */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td,
+				struct link_map *l __attribute__((__unused__)),
+				ptrdiff_t entry_check_offset)
+{
+  /* Maybe we're lucky and can return early.  */
+  if (__builtin_return_address (0) - entry_check_offset != td->entry)
+    return;
+
+  /* Locking here will stop execution until the running resolver runs
+     _dl_tlsdesc_wake_up_held_fixups(), releasing the lock.
+
+     FIXME: We'd be better off waiting on a condition variable, such
+     that we didn't have to hold the lock throughout the relocation
+     processing.  */
+  __rtld_lock_lock_recursive (GL(dl_load_lock));
+  __rtld_lock_unlock_recursive (GL(dl_load_lock));
+}
+
+
+/* Unmap the dynamic object, but also release its TLS descriptor table
+   if there is one.  */
+
+void
+internal_function
+_dl_unmap (struct link_map *map)
+{
+  _dl_unmap_segments (map);
+
+#ifdef SHARED
+  if (map->l_mach.tlsdesc_table)
+    htab_delete (map->l_mach.tlsdesc_table);
+#endif
+}
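
All of the fixups above follow one pattern: a TLS descriptor is an (entry, arg) pair in the GOT, and resolving it means storing a new pair whose entry function can compute the TLS offset without further locking. A simplified sketch of that shape (the real definition lives in dl-tlsdesc.h, and on i386 the descriptor address actually arrives in a register rather than as a normal C argument):

    #include <stddef.h>

    /* Simplified: the caller effectively performs
         offset = td->entry (td);
       and adds the thread pointer (%gs:0) to obtain the variable's
       address, so swapping td->entry/td->arg is all resolution does.  */
    struct tlsdesc_sketch
    {
      ptrdiff_t (*entry) (struct tlsdesc_sketch *td);
      void *arg;
    };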
diff --git a/REORG.TODO/sysdeps/i386/tlsdesc.sym b/REORG.TODO/sysdeps/i386/tlsdesc.sym
new file mode 100644
index 0000000000..33854975d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tlsdesc.sym
@@ -0,0 +1,17 @@
+#include <stddef.h>
+#include <sysdep.h>
+#include <tls.h>
+#include <link.h>
+#include <dl-tlsdesc.h>
+
+--
+
+-- Abuse tls.h macros to derive offsets relative to the thread register.
+
+DTV_OFFSET			offsetof(struct pthread, header.dtv)
+
+TLSDESC_ARG			offsetof(struct tlsdesc, arg)
+
+TLSDESC_GEN_COUNT		offsetof(struct tlsdesc_dynamic_arg, gen_count)
+TLSDESC_MODID			offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
+TLSDESC_MODOFF			offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
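
At build time this .sym file is run through glibc's assembly-constants machinery, which evaluates each offsetof expression and emits a header of plain numeric macros that dl-tlsdesc.S can use. The generated header looks roughly like the following, where the numeric values are placeholders, not the real offsets:

    /* Illustrative output only; the actual values depend on the layouts
       of struct pthread, struct tlsdesc and struct tlsdesc_dynamic_arg.  */
    #define DTV_OFFSET        8
    #define TLSDESC_ARG       4
    #define TLSDESC_GEN_COUNT 0
    #define TLSDESC_MODID     4
    #define TLSDESC_MODOFF    8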
diff --git a/REORG.TODO/sysdeps/i386/tst-audit.h b/REORG.TODO/sysdeps/i386/tst-audit.h
new file mode 100644
index 0000000000..87bf199c85
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit.h
@@ -0,0 +1,25 @@
+/* Definitions for testing PLT entry/exit auditing.  i386 version.
+
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define pltenter la_i86_gnu_pltenter
+#define pltexit la_i86_gnu_pltexit
+#define La_regs La_i86_regs
+#define La_retval La_i86_retval
+#define int_retval lrv_eax
diff --git a/REORG.TODO/sysdeps/i386/tst-audit3.c b/REORG.TODO/sysdeps/i386/tst-audit3.c
new file mode 100644
index 0000000000..b67a59d733
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit3.c
@@ -0,0 +1,37 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include "tst-audit3.h"
+
+static int
+do_test (void)
+{
+  long long ll = audit1_test (1, 2, 3);
+  if (ll != 30)
+    abort ();
+
+  float f = audit2_test (1, 2, 3);
+  if (f != 30)
+    abort ();
+
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/REORG.TODO/sysdeps/i386/tst-audit3.h b/REORG.TODO/sysdeps/i386/tst-audit3.h
new file mode 100644
index 0000000000..f6d3b9181e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit3.h
@@ -0,0 +1,20 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+extern long long audit1_test (int, int, int) __attribute__ ((regparm(3)));
+extern float audit2_test (int, int, int) __attribute__ ((regparm(3)));
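
With regparm(3), GCC passes the first three integer arguments in %eax, %edx and %ecx instead of on the stack, which is exactly what the pltenter/pltexit hooks in tst-auditmod3b.c verify (lr_eax == 1, lr_edx == 2, lr_ecx == 3). A sketch of a call site, assuming this header is on the include path:

    #include "tst-audit3.h"

    long long
    call_it (void)
    {
      /* Arguments travel in %eax, %edx, %ecx; the 64-bit result comes
         back in %edx:%eax, which is how pltexit reassembles it.  */
      return audit1_test (1, 2, 3);
    }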
diff --git a/REORG.TODO/sysdeps/i386/tst-auditmod3a.c b/REORG.TODO/sysdeps/i386/tst-auditmod3a.c
new file mode 100644
index 0000000000..a333cdcff9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-auditmod3a.c
@@ -0,0 +1,38 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include "tst-audit3.h"
+
+long long
+__attribute__ ((regparm(3)))
+audit1_test (int i, int j, int k)
+{
+  if (i != 1 || j != 2 || k != 3)
+    abort ();
+  return 30;
+}
+
+float
+__attribute__ ((regparm(3)))
+audit2_test (int i, int j, int k)
+{
+  if (i != 1 || j != 2 || k != 3)
+    abort ();
+  return 30;
+}
diff --git a/REORG.TODO/sysdeps/i386/tst-auditmod3b.c b/REORG.TODO/sysdeps/i386/tst-auditmod3b.c
new file mode 100644
index 0000000000..523f3cec90
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-auditmod3b.c
@@ -0,0 +1,186 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <link.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  const char *flagstr;
+  switch (flag)
+    {
+    case LA_ACT_CONSISTENT:
+      flagstr = "consistent";
+      break;
+    case LA_ACT_ADD:
+      flagstr = "add";
+      break;
+    case LA_ACT_DELETE:
+      flagstr = "delete";
+      break;
+    default:
+      printf ("activity: unknown activity %u\n", flag);
+      return;
+    }
+  printf ("activity: %s\n", flagstr);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  const char *flagstr;
+  switch (flag)
+    {
+    case LA_SER_ORIG:
+      flagstr = "LA_SER_ORIG";
+      break;
+    case LA_SER_LIBPATH:
+      flagstr = "LA_SER_LIBPATH";
+      break;
+    case LA_SER_RUNPATH:
+      flagstr = "LA_SER_RUNPATH";
+      break;
+    case LA_SER_CONFIG:
+      flagstr = "LA_SER_CONFIG";
+      break;
+    case LA_SER_DEFAULT:
+      flagstr = "LA_SER_DEFAULT";
+      break;
+    case LA_SER_SECURE:
+      flagstr = "LA_SER_SECURE";
+      break;
+    default:
+      printf ("objsearch: %s, unknown flag %d\n", name, flag);
+      return (char *) name;
+    }
+
+  printf ("objsearch: %s, %s\n", name, flagstr);
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#include "tst-audit.h"
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  if (strcmp (symname, "audit1_test") == 0
+      || strcmp (symname, "audit2_test") == 0)
+    {
+      if (regs->lr_eax != 1
+	  || regs->lr_edx != 2
+	  || regs->lr_ecx != 3)
+	abort ();
+
+      *framesizep = 200;
+    }
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx,
+	  (ptrdiff_t) outregs->int_retval);
+
+  if (strcmp (symname, "audit1_test") == 0
+      || strcmp (symname, "audit2_test") == 0)
+    {
+      if (inregs->lr_eax != 1
+	  || inregs->lr_edx != 2
+	  || inregs->lr_ecx != 3)
+	abort ();
+
+      if (strcmp (symname, "audit1_test") == 0)
+	{
+	  long long x = ((unsigned long long) outregs->lrv_eax
+			 | (unsigned long long) outregs->lrv_edx << 32);
+
+	  if (x != 30)
+	    abort ();
+	}
+      else if (strcmp (symname, "audit2_test") == 0)
+	{
+	  if (outregs->lrv_st0 != 30)
+	    abort ();
+	}
+    }
+
+  return 0;
+}
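
In the glibc test harness this module is not linked into the test binary: tst-audit3 links against tst-auditmod3a (which defines the audited functions), and this file is loaded through the LD_AUDIT environment variable, roughly LD_AUDIT=<objdir>/tst-auditmod3b.so <objdir>/tst-audit3, so the hooks above observe the regparm(3) calls made by do_test.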
diff --git a/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh b/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh
new file mode 100755
index 0000000000..83a1dc59fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Make sure no code in ld.so uses xmm/ymm/zmm registers on i386.
+# Copyright (C) 2009-2017 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+set -e
+
+objpfx="$1"
+NM="$2"
+OBJDUMP="$3"
+READELF="$4"
+
+tmp=$(mktemp ${objpfx}tst-ld-sse-use.XXXXXX)
+trap 'rm -f "$tmp"' 1 2 3 15
+
+# List of object files we have to test
+rtldobjs=$($READELF -W -wi ${objpfx}dl-allobjs.os |
+    awk '/^ </ { if ($5 == "(DW_TAG_compile_unit)") c=1; else c=0 } $2 == "DW_AT_name" { if (c == 1) print $NF }' |
+    sed 's,\(.*/\|\)\([_[:alnum:]-]*[.]\).$,\2os,')
+rtldobjs="$rtldobjs $(ar t ${objpfx}rtld-libc.a)"
+
+# OBJECT symbols can be ignored.
+$READELF -sW ${objpfx}dl-allobjs.os ${objpfx}rtld-libc.a |
+egrep " OBJECT  *GLOBAL " |
+awk '{if ($7 != "ABS") print $8 }' |
+sort -u > "$tmp"
+declare -a objects
+objects=($(cat "$tmp"))
+
+objs="dl-runtime.os"
+tocheck="dl-runtime.os"
+
+while test -n "$objs"; do
+  this="$objs"
+  objs=""
+
+  for f in $this; do
+    undef=$($NM -u "$objpfx"../*/"$f" | awk '{print $2}')
+    if test -n "$undef"; then
+      for s in $undef; do
+	for obj in ${objects[*]} "_GLOBAL_OFFSET_TABLE_"; do
+	  if test "$obj" = "$s"; then
+	    continue 2
+	  fi
+	done
+        for o in $rtldobjs; do
+	  ro=$(echo "$objpfx"../*/"$o")
+	  if $NM -g --defined-only "$ro" | egrep -qs " $s\$"; then
+	    if ! (echo "$tocheck $objs" | fgrep -qs "$o"); then
+	      echo "$o needed for $s"
+	      objs="$objs $o"
+	    fi
+	    break;
+	  fi
+	done
+      done
+    fi
+  done
+  tocheck="$tocheck$objs"
+done
+
+echo
+echo
+echo "object files needed: $tocheck"
+
+cp /dev/null "$tmp"
+for f in $tocheck; do
+  $OBJDUMP -d "$objpfx"../*/"$f" |
+  awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xyz]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' |
+  while read fct; do
+    if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then
+      continue;
+    fi
+    echo "function $fct in $f modifies xmm/ymm/zmm" >> "$tmp"
+    result=1
+  done
+done
+
+if test -s "$tmp"; then
+  echo
+  echo
+  cat "$tmp"
+  result=1
+else
+  result=0
+fi
+
+rm "$tmp"
+exit $result
diff --git a/REORG.TODO/sysdeps/i386/tst-stack-align.h b/REORG.TODO/sysdeps/i386/tst-stack-align.h
new file mode 100644
index 0000000000..76276d4a28
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-stack-align.h
@@ -0,0 +1,41 @@
+/* Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <stdint.h>
+
+typedef struct { int i[4]; } int_al16 __attribute__((aligned (16)));
+
+#define TEST_STACK_ALIGN() \
+  ({									     \
+    int_al16 _m;							     \
+    double _d = 12.0;							     \
+    long double _ld = 15.0;						     \
+    int _ret = 0;							     \
+    printf ("int_al16:  %p %zu\n", &_m, __alignof (int_al16));		     \
+    if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0)		     \
+      _ret = 1;								     \
+									     \
+    printf ("double:  %g %p %zu\n", _d, &_d, __alignof (double));	     \
+    if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0)		     \
+      _ret = 1;								     \
+									     \
+    printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double));    \
+    if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0)	     \
+      _ret = 1;								     \
+    _ret;								     \
+    })