about summary refs log tree commit diff
path: root/REORG.TODO/sysdeps/i386
diff options
context:
space:
mode:
Diffstat (limited to 'REORG.TODO/sysdeps/i386')
-rw-r--r--REORG.TODO/sysdeps/i386/Implies5
-rw-r--r--REORG.TODO/sysdeps/i386/Makefile103
-rw-r--r--REORG.TODO/sysdeps/i386/Versions35
-rw-r--r--REORG.TODO/sysdeps/i386/____longjmp_chk.S1
-rw-r--r--REORG.TODO/sysdeps/i386/__longjmp.S72
-rw-r--r--REORG.TODO/sysdeps/i386/abort-instr.h2
-rw-r--r--REORG.TODO/sysdeps/i386/add_n.S111
-rw-r--r--REORG.TODO/sysdeps/i386/addmul_1.S86
-rw-r--r--REORG.TODO/sysdeps/i386/asm-syntax.h24
-rw-r--r--REORG.TODO/sysdeps/i386/atomic-machine.h545
-rw-r--r--REORG.TODO/sysdeps/i386/backtrace.c163
-rw-r--r--REORG.TODO/sysdeps/i386/bcopy.S4
-rw-r--r--REORG.TODO/sysdeps/i386/bsd-_setjmp.S56
-rw-r--r--REORG.TODO/sysdeps/i386/bsd-setjmp.S66
-rw-r--r--REORG.TODO/sysdeps/i386/bzero.S5
-rw-r--r--REORG.TODO/sysdeps/i386/cacheinfo.c3
-rw-r--r--REORG.TODO/sysdeps/i386/configure84
-rw-r--r--REORG.TODO/sysdeps/i386/configure.ac52
-rw-r--r--REORG.TODO/sysdeps/i386/crti.S84
-rw-r--r--REORG.TODO/sysdeps/i386/crtn.S47
-rw-r--r--REORG.TODO/sysdeps/i386/dl-irel.h51
-rw-r--r--REORG.TODO/sysdeps/i386/dl-lookupcfg.h32
-rw-r--r--REORG.TODO/sysdeps/i386/dl-machine.h757
-rw-r--r--REORG.TODO/sysdeps/i386/dl-procinfo.c65
-rw-r--r--REORG.TODO/sysdeps/i386/dl-tls.h61
-rw-r--r--REORG.TODO/sysdeps/i386/dl-tlsdesc.S285
-rw-r--r--REORG.TODO/sysdeps/i386/dl-tlsdesc.h61
-rw-r--r--REORG.TODO/sysdeps/i386/dl-trampoline.S215
-rw-r--r--REORG.TODO/sysdeps/i386/ffs.c50
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/Implies1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/Versions6
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/doasin.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_acos.S25
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_acosf.S24
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_acosh.S101
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_acoshf.S101
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_acoshl.S107
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_acosl.c29
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_asin.S38
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_asinf.S39
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_atan2.S30
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_atan2f.S30
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_atan2l.c19
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_atanh.S112
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_atanhf.S109
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_atanhl.S127
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_exp.S73
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_exp10.S53
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_exp10f.S53
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_exp10l.S2
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_exp2.S52
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_exp2f.S52
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_exp2l.S60
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_expf.S74
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_expl.S226
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_fmod.S18
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_fmodf.S19
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_fmodl.c23
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_hypot.S75
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_hypotf.S64
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_ilogb.S42
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S42
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S43
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_log.S92
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_log10.S68
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_log10f.S69
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_log10l.S71
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_log2.S69
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_log2f.S69
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_log2l.S70
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_logf.S93
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_logl.S97
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_pow.S456
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_powf.S392
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_powl.S459
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c3
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_remainder.S18
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_remainderf.S18
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_remainderl.S20
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_scalb.S100
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_scalbf.S102
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_scalbl.S90
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_sqrt.S23
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S13
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c20
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c69
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c54
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c54
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fegetenv.c49
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fegetexcept.c31
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fegetmode.c32
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fegetround.c33
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c50
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fenv_private.h501
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fesetenv.c131
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fesetexcept.c31
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fesetmode.c54
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fesetround.c54
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/feupdateenv.c60
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c57
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c124
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c69
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/ftestexcept.c40
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/halfulp.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h340
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/libm-test-ulps2202
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/math-tests.h27
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/math_private.h7
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/mpatan.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/mpatan2.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/mpexp.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/mplog.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/mpsqrt.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_asinh.S139
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_asinhf.S139
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_asinhl.S144
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_atan.S30
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_atanf.S30
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_atanl.c22
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_cbrt.S200
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S177
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S229
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_ceil.S34
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_ceilf.S34
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_ceill.S40
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_copysign.S20
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_copysignf.S20
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_copysignl.S21
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_expm1.S113
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_expm1f.S113
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_expm1l.S2
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fabs.S9
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fabsf.S9
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fabsl.S9
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fdim.c50
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_finite.S17
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_finitef.S16
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_finitel.S15
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_floor.S34
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_floorf.S34
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_floorl.S40
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fmax.S43
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S43
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S71
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fmin.S43
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fminf.S43
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fminl.S71
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c42
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_frexp.S83
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_frexpf.S80
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_frexpl.S92
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_isinfl.c32
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_isnanl.c43
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_llrint.S36
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_llrintf.S36
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_llrintl.S36
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_log1p.S67
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_log1pf.S67
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_log1pl.S76
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_logb.S16
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_logbf.S16
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_logbl.c19
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_lrint.S34
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_lrintf.S34
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_lrintl.S34
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S20
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S20
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S23
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c125
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c93
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c77
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_remquo.S45
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_remquof.S45
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_remquol.S45
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_rint.S15
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_rintf.S15
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_rintl.c18
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_scalbln.c2
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c2
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c2
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_scalbn.S24
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S24
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S23
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_significand.S16
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_significandf.S16
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_significandl.c19
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_trunc.S37
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_truncf.S37
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/s_truncl.S40
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/slowexp.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/slowpow.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/t_exp.c1
-rw-r--r--REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c8
-rw-r--r--REORG.TODO/sysdeps/i386/gccframe.h27
-rw-r--r--REORG.TODO/sysdeps/i386/gmp-mparam.h28
-rw-r--r--REORG.TODO/sysdeps/i386/htonl.S34
-rw-r--r--REORG.TODO/sysdeps/i386/htons.S35
-rw-r--r--REORG.TODO/sysdeps/i386/i386-mcount.S79
-rw-r--r--REORG.TODO/sysdeps/i386/i586/add_n.S143
-rw-r--r--REORG.TODO/sysdeps/i386/i586/addmul_1.S94
-rw-r--r--REORG.TODO/sysdeps/i386/i586/bzero.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i586/init-arch.h19
-rw-r--r--REORG.TODO/sysdeps/i386/i586/lshift.S255
-rw-r--r--REORG.TODO/sysdeps/i386/i586/memcopy.h95
-rw-r--r--REORG.TODO/sysdeps/i386/i586/memcpy.S124
-rw-r--r--REORG.TODO/sysdeps/i386/i586/mempcpy.S8
-rw-r--r--REORG.TODO/sysdeps/i386/i586/memset.S121
-rw-r--r--REORG.TODO/sysdeps/i386/i586/memusage.h1
-rw-r--r--REORG.TODO/sysdeps/i386/i586/mul_1.S90
-rw-r--r--REORG.TODO/sysdeps/i386/i586/rshift.S255
-rw-r--r--REORG.TODO/sysdeps/i386/i586/stpcpy.S8
-rw-r--r--REORG.TODO/sysdeps/i386/i586/strchr.S348
-rw-r--r--REORG.TODO/sysdeps/i386/i586/strcpy.S169
-rw-r--r--REORG.TODO/sysdeps/i386/i586/strlen.S182
-rw-r--r--REORG.TODO/sysdeps/i386/i586/sub_n.S143
-rw-r--r--REORG.TODO/sysdeps/i386/i586/submul_1.S94
-rw-r--r--REORG.TODO/sysdeps/i386/i686/Makefile12
-rw-r--r--REORG.TODO/sysdeps/i386/i686/add_n.S110
-rw-r--r--REORG.TODO/sysdeps/i386/i686/bcopy.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/bzero.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/dl-hash.h79
-rw-r--r--REORG.TODO/sysdeps/i386/i686/ffs.c48
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/e_log.S29
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S30
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S94
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S22
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S325
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c37
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps2188
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S553
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c29
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S586
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c30
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S566
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c28
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S39
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S39
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S58
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S37
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S37
-rw-r--r--REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S58
-rw-r--r--REORG.TODO/sysdeps/i386/i686/hp-timing.h42
-rw-r--r--REORG.TODO/sysdeps/i386/i686/init-arch.h19
-rw-r--r--REORG.TODO/sysdeps/i386/i686/memcmp.S408
-rw-r--r--REORG.TODO/sysdeps/i386/i686/memcpy.S98
-rw-r--r--REORG.TODO/sysdeps/i386/i686/memmove.S120
-rw-r--r--REORG.TODO/sysdeps/i386/i686/mempcpy.S65
-rw-r--r--REORG.TODO/sysdeps/i386/i686/memset.S100
-rw-r--r--REORG.TODO/sysdeps/i386/i686/memusage.h21
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/Makefile44
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S59
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S62
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c376
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym11
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S502
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S709
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S65
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S1225
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S2157
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S62
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S681
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S1809
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S3162
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S78
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S50
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S89
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S94
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S81
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S50
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c7
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S417
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S724
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S45
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S811
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S860
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memset.S75
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S82
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S65
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c27
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c34
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c27
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c34
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S9
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c12
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S39
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c13
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S7
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S1245
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S572
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S92
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S158
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S348
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S57
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S804
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S2810
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S95
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S2250
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S3901
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S116
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S75
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S125
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S695
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S60
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S39
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c13
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S7
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c10
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S37
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S282
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S708
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S57
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S56
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c22
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S219
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S36
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c14
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S1018
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S39
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S600
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S36
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c9
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S193
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S37
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S354
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S35
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c9
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S40
-rw-r--r--REORG.TODO/sysdeps/i386/i686/nptl/tls.h35
-rw-r--r--REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S20
-rw-r--r--REORG.TODO/sysdeps/i386/i686/stack-aliasing.h23
-rw-r--r--REORG.TODO/sysdeps/i386/i686/strcmp.S52
-rw-r--r--REORG.TODO/sysdeps/i386/i686/tst-stack-align.h44
-rw-r--r--REORG.TODO/sysdeps/i386/i786/Implies2
-rw-r--r--REORG.TODO/sysdeps/i386/init-arch.h19
-rw-r--r--REORG.TODO/sysdeps/i386/jmpbuf-offsets.h25
-rw-r--r--REORG.TODO/sysdeps/i386/jmpbuf-unwind.h47
-rw-r--r--REORG.TODO/sysdeps/i386/ldbl2mpn.c120
-rw-r--r--REORG.TODO/sysdeps/i386/ldsodefs.h41
-rw-r--r--REORG.TODO/sysdeps/i386/link-defines.sym20
-rw-r--r--REORG.TODO/sysdeps/i386/lshift.S103
-rw-r--r--REORG.TODO/sysdeps/i386/machine-gmon.h40
-rw-r--r--REORG.TODO/sysdeps/i386/memchr.S322
-rw-r--r--REORG.TODO/sysdeps/i386/memcmp.S73
-rw-r--r--REORG.TODO/sysdeps/i386/memcopy.h92
-rw-r--r--REORG.TODO/sysdeps/i386/memcpy.S95
-rw-r--r--REORG.TODO/sysdeps/i386/memcpy_chk.S34
-rw-r--r--REORG.TODO/sysdeps/i386/memmove.S4
-rw-r--r--REORG.TODO/sysdeps/i386/memmove_chk.S33
-rw-r--r--REORG.TODO/sysdeps/i386/mempcpy.S7
-rw-r--r--REORG.TODO/sysdeps/i386/mempcpy_chk.S33
-rw-r--r--REORG.TODO/sysdeps/i386/memset.S68
-rw-r--r--REORG.TODO/sysdeps/i386/memset_chk.S33
-rw-r--r--REORG.TODO/sysdeps/i386/memusage.h20
-rw-r--r--REORG.TODO/sysdeps/i386/mp_clz_tab.c1
-rw-r--r--REORG.TODO/sysdeps/i386/mul_1.S86
-rw-r--r--REORG.TODO/sysdeps/i386/nptl/Makefile26
-rw-r--r--REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c19
-rw-r--r--REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S37
-rw-r--r--REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S31
-rw-r--r--REORG.TODO/sysdeps/i386/nptl/pthreaddef.h40
-rw-r--r--REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym17
-rw-r--r--REORG.TODO/sysdeps/i386/nptl/tls.h435
-rw-r--r--REORG.TODO/sysdeps/i386/preconfigure5
-rw-r--r--REORG.TODO/sysdeps/i386/pthread_spin_trylock.S46
-rw-r--r--REORG.TODO/sysdeps/i386/rawmemchr.S222
-rw-r--r--REORG.TODO/sysdeps/i386/rshift.S105
-rw-r--r--REORG.TODO/sysdeps/i386/setfpucw.c54
-rw-r--r--REORG.TODO/sysdeps/i386/setjmp.S58
-rw-r--r--REORG.TODO/sysdeps/i386/stackguard-macros.h12
-rw-r--r--REORG.TODO/sysdeps/i386/stackinfo.h43
-rw-r--r--REORG.TODO/sysdeps/i386/start.S139
-rw-r--r--REORG.TODO/sysdeps/i386/stpcpy.S88
-rw-r--r--REORG.TODO/sysdeps/i386/stpncpy.S147
-rw-r--r--REORG.TODO/sysdeps/i386/strcat.S265
-rw-r--r--REORG.TODO/sysdeps/i386/strchr.S290
-rw-r--r--REORG.TODO/sysdeps/i386/strchrnul.S278
-rw-r--r--REORG.TODO/sysdeps/i386/strcspn.S240
-rw-r--r--REORG.TODO/sysdeps/i386/string-inlines.c47
-rw-r--r--REORG.TODO/sysdeps/i386/strlen.S132
-rw-r--r--REORG.TODO/sysdeps/i386/strlen.c35
-rw-r--r--REORG.TODO/sysdeps/i386/strpbrk.S243
-rw-r--r--REORG.TODO/sysdeps/i386/strrchr.S334
-rw-r--r--REORG.TODO/sysdeps/i386/strspn.S240
-rw-r--r--REORG.TODO/sysdeps/i386/sub_n.S111
-rw-r--r--REORG.TODO/sysdeps/i386/submul_1.S86
-rw-r--r--REORG.TODO/sysdeps/i386/symbol-hacks.h21
-rw-r--r--REORG.TODO/sysdeps/i386/sys/ucontext.h139
-rw-r--r--REORG.TODO/sysdeps/i386/sysdep.h159
-rw-r--r--REORG.TODO/sysdeps/i386/tls-macros.h78
-rw-r--r--REORG.TODO/sysdeps/i386/tlsdesc.c268
-rw-r--r--REORG.TODO/sysdeps/i386/tlsdesc.sym17
-rw-r--r--REORG.TODO/sysdeps/i386/tst-audit.h25
-rw-r--r--REORG.TODO/sysdeps/i386/tst-audit3.c37
-rw-r--r--REORG.TODO/sysdeps/i386/tst-audit3.h20
-rw-r--r--REORG.TODO/sysdeps/i386/tst-auditmod3a.c38
-rw-r--r--REORG.TODO/sysdeps/i386/tst-auditmod3b.c186
-rwxr-xr-xREORG.TODO/sysdeps/i386/tst-ld-sse-use.sh103
-rw-r--r--REORG.TODO/sysdeps/i386/tst-stack-align.h41
450 files changed, 62011 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/Implies b/REORG.TODO/sysdeps/i386/Implies
new file mode 100644
index 0000000000..20b2dffc29
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Implies
@@ -0,0 +1,5 @@
+x86
+wordsize-32
+ieee754/ldbl-96
+ieee754/dbl-64
+ieee754/flt-32
diff --git a/REORG.TODO/sysdeps/i386/Makefile b/REORG.TODO/sysdeps/i386/Makefile
new file mode 100644
index 0000000000..e30e1339f0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Makefile
@@ -0,0 +1,103 @@
+# The mpn functions need a #define for asm syntax flavor.
+# Every i386 port in use uses gas syntax (I think).
+asm-CPPFLAGS += -DGAS_SYNTAX
+
+# The i386 `long double' is a distinct type we support.
+long-double-fcts = yes
+
+ifeq ($(subdir),string)
+sysdep_routines += cacheinfo
+endif
+
+ifeq ($(subdir),gmon)
+sysdep_routines += i386-mcount
+endif
+
+ifeq ($(subdir),elf)
+CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused
+CFLAGS-dl-load.c += -Wno-unused
+CFLAGS-dl-reloc.c += -Wno-unused
+endif
+
+ifeq ($(subdir),debug)
+CFLAGS-backtrace.c += -fexceptions
+endif
+
+# Most of the glibc routines don't ever call user defined callbacks
+# nor use any FPU or SSE* and as such don't need bigger %esp alignment
+# than 4 bytes.
+# Lots of routines in math will use FPU, so make math subdir an exception
+# here.
+# In gcc 4.6 (and maybe earlier?) giving -mpreferred-stack-boundary=2 is
+# an error, so don't try to reduce it here like we used to.  We still
+# explicitly set -mpreferred-stack-boundary=4 in the places where it matters,
+# in case an older compiler defaulted to 2.
+ifeq ($(subdir),math)
+sysdep-CFLAGS += -mpreferred-stack-boundary=4
+else
+ifeq ($(subdir),csu)
+sysdep-CFLAGS += -mpreferred-stack-boundary=4
+gen-as-const-headers += link-defines.sym
+else
+# Likewise, any function which calls user callbacks
+uses-callbacks += -mpreferred-stack-boundary=4
+# Likewise, any stack alignment tests
+stack-align-test-flags += -malign-double -mpreferred-stack-boundary=4
+endif
+endif
+
+# And a couple of other routines
+ifeq ($(subdir),stdlib)
+CFLAGS-exit.c += -mpreferred-stack-boundary=4
+CFLAGS-cxa_finalize.c += -mpreferred-stack-boundary=4
+endif
+ifeq ($(subdir),elf)
+CFLAGS-dl-init.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-fini.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-open.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-close.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-error.c += -mpreferred-stack-boundary=4
+endif
+ifeq ($(subdir),dlfcn)
+CFLAGS-dlopen.c += -mpreferred-stack-boundary=4
+CFLAGS-dlopenold.c += -mpreferred-stack-boundary=4
+CFLAGS-dlclose.c += -mpreferred-stack-boundary=4
+CFLAGS-dlerror.c += -mpreferred-stack-boundary=4
+endif
+
+ifneq (,$(filter -mno-tls-direct-seg-refs,$(CFLAGS)))
+defines += -DNO_TLS_DIRECT_SEG_REFS
+endif
+
+ifeq ($(subdir),elf)
+sysdep-dl-routines += tlsdesc dl-tlsdesc
+
+tests += tst-audit3
+modules-names += tst-auditmod3a tst-auditmod3b
+
+$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
+$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
+tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so
+endif
+
+ifeq ($(subdir),csu)
+gen-as-const-headers += tlsdesc.sym
+endif
+
+# Make sure no code in ld.so uses mm/xmm/ymm/zmm registers on i386 since
+# the first 3 mm/xmm/ymm/zmm registers are used to pass vector parameters
+# which must be preserved.
+# With SSE disabled, ensure -mfpmath is not set to use sse either.
+rtld-CFLAGS += -mno-sse -mno-mmx -mfpmath=387
+ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   $(rtld-CFLAGS))
+
+tests-special += $(objpfx)tst-ld-sse-use.out
+$(objpfx)tst-ld-sse-use.out: ../sysdeps/i386/tst-ld-sse-use.sh $(objpfx)ld.so
+	@echo "Checking ld.so for SSE register use.  This will take a few seconds..."
+	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
+	$(evaluate-test)
+else
+CFLAGS-.os += $(if $(filter rtld-%.os,$(@F)), $(rtld-CFLAGS))
+endif
diff --git a/REORG.TODO/sysdeps/i386/Versions b/REORG.TODO/sysdeps/i386/Versions
new file mode 100644
index 0000000000..7be44aad7a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Versions
@@ -0,0 +1,35 @@
+ld {
+  GLIBC_2.3 {
+    # The alternative i386 runtime interface to TLS.
+    ___tls_get_addr;
+  }
+}
+libc {
+  GLIBC_2.0 {
+    # Functions from libgcc.
+    __divdi3; __moddi3; __udivdi3; __umoddi3;
+  }
+  GLIBC_2.1 {
+    # global variable
+    _fp_hw;
+  }
+  GLIBC_2.1.1 {
+    # extern inline functions used by <bits/string.h>
+    __memcpy_c; __memset_cc; __memset_cg; __memset_gg;
+    __memcpy_by2; __memcpy_by4; __memcpy_g; __mempcpy_by2; __mempcpy_by4;
+    __mempcpy_byn; __memset_ccn_by2; __memset_ccn_by4; __memset_gcn_by2;
+    __memset_gcn_by4; __stpcpy_g; __strcat_c; __strcat_g; __strchr_c;
+    __strchr_g; __strchrnul_c; __strchrnul_g; __strcmp_gg; __strcpy_g;
+    __strcspn_c1; __strcspn_cg; __strcspn_g; __strlen_g; __strncat_g;
+    __strncmp_g; __strncpy_by2; __strncpy_by4; __strncpy_byn; __strncpy_gg;
+    __strpbrk_cg; __strpbrk_g; __strrchr_c; __strrchr_g; __strspn_c1;
+    __strspn_cg; __strspn_g; __strstr_cg; __strstr_g;
+  }
+}
+libm {
+  GLIBC_2.1 {
+    # A generic bug got this omitted from other configurations' version
+    # sets, but we always had it.
+    exp2l;
+  }
+}
diff --git a/REORG.TODO/sysdeps/i386/____longjmp_chk.S b/REORG.TODO/sysdeps/i386/____longjmp_chk.S
new file mode 100644
index 0000000000..0910861a9d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/____longjmp_chk.S
@@ -0,0 +1 @@
+#error "OS-specific version needed"
diff --git a/REORG.TODO/sysdeps/i386/__longjmp.S b/REORG.TODO/sysdeps/i386/__longjmp.S
new file mode 100644
index 0000000000..3719763cd6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/__longjmp.S
@@ -0,0 +1,72 @@
+/* longjmp for i386.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <asm-syntax.h>
+#include <stap-probe.h>
+
+	.text
+ENTRY (__longjmp)
+#ifdef PTR_DEMANGLE
+	movl 4(%esp), %eax	/* User's jmp_buf in %eax.  */
+
+	/* Fetch the saved (still mangled) return address now.  */
+	movl (JB_PC*4)(%eax), %edx
+	/* Fetch the saved (still mangled) stack pointer.  */
+	movl (JB_SP*4)(%eax), %ecx
+	PTR_DEMANGLE (%edx)
+	PTR_DEMANGLE (%ecx)
+	LIBC_PROBE (longjmp, 3, 4@%eax, -4@8(%esp), 4@%edx)
+	cfi_def_cfa(%eax, 0)
+	cfi_register(%eip, %edx)
+	cfi_register(%esp, %ecx)
+	cfi_offset(%ebx, JB_BX*4)
+	cfi_offset(%esi, JB_SI*4)
+	cfi_offset(%edi, JB_DI*4)
+	cfi_offset(%ebp, JB_BP*4)
+	/* Restore %ebx, %esi, %edi and %ebp from the jmp_buf.  */
+	movl (JB_BX*4)(%eax), %ebx
+	movl (JB_SI*4)(%eax), %esi
+	movl (JB_DI*4)(%eax), %edi
+	movl (JB_BP*4)(%eax), %ebp
+	cfi_restore(%ebx)
+	cfi_restore(%esi)
+	cfi_restore(%edi)
+	cfi_restore(%ebp)
+
+	LIBC_PROBE (longjmp_target, 3, 4@%eax, -4@8(%esp), 4@%edx)
+	movl 8(%esp), %eax	/* Second argument is return value.  */
+	movl %ecx, %esp		/* Switch to the demangled stack pointer.  */
+#else /* !PTR_DEMANGLE */
+	movl 4(%esp), %ecx	/* User's jmp_buf in %ecx.  */
+	movl 8(%esp), %eax	/* Second argument is return value.  */
+	/* Save the return address now.  */
+	movl (JB_PC*4)(%ecx), %edx
+	LIBC_PROBE (longjmp, 3, 4@%ecx, -4@%eax, 4@%edx)
+	/* Restore registers; %ecx (jmp_buf) and %eax (value) are left untouched.  */
+	movl (JB_BX*4)(%ecx), %ebx
+	movl (JB_SI*4)(%ecx), %esi
+	movl (JB_DI*4)(%ecx), %edi
+	movl (JB_BP*4)(%ecx), %ebp
+	movl (JB_SP*4)(%ecx), %esp
+	LIBC_PROBE (longjmp_target, 3, 4@%ecx, -4@%eax, 4@%edx)
+#endif /* PTR_DEMANGLE */
+	/* Jump to saved PC.  */
+	jmp *%edx
+END (__longjmp)
diff --git a/REORG.TODO/sysdeps/i386/abort-instr.h b/REORG.TODO/sysdeps/i386/abort-instr.h
new file mode 100644
index 0000000000..810f10379b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/abort-instr.h
@@ -0,0 +1,2 @@
+/* An instruction which should crash any program is `hlt'.  */
+#define ABORT_INSTRUCTION asm ("hlt")
diff --git a/REORG.TODO/sysdeps/i386/add_n.S b/REORG.TODO/sysdeps/i386/add_n.S
new file mode 100644
index 0000000000..c2923094a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/add_n.S
@@ -0,0 +1,111 @@
+/* Add two limb vectors of the same length > 0 and store sum in a third
+   limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+8	/* space for 2 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+/* __mpn_add_n: add the SIZE-limb vectors at S1 and S2, store the sum at
+   RES and return the carry out of the last limb (0 or 1) in %eax.  The
+   arguments are read from the stack at the RES/S1/S2/SIZE offsets.  */
+ENTRY (__mpn_add_n)
+
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	/* %edi = RES, %esi = S1, %edx = S2, %ecx = SIZE.  */
+	movl RES(%esp),%edi
+	cfi_rel_offset (edi, 4)
+	movl S1(%esp),%esi
+	cfi_rel_offset (esi, 0)
+	movl S2(%esp),%edx
+	movl SIZE(%esp),%ecx
+	/* The loop below handles 8 limbs per pass.  %ecx becomes the number
+	   of passes and %eax = (-SIZE) & 7 the number of limb slots that are
+	   skipped at the start of the first, partial pass.  */
+	movl	%ecx,%eax
+	shrl	$3,%ecx			/* compute count for unrolled loop */
+	negl	%eax
+	andl	$7,%eax			/* get index where to start loop */
+	jz	L(oop)			/* necessary special case for 0 */
+	incl	%ecx			/* adjust loop count */
+	shll	$2,%eax			/* adjustment for pointers... */
+	subl	%eax,%edi		/* ... since they are offset ... */
+	subl	%eax,%esi		/* ... by a constant when we ... */
+	subl	%eax,%edx		/* ... enter the loop */
+	shrl	$2,%eax			/* restore previous value */
+#ifdef PIC
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, Loop-L0-3 cannot be put into the leal */
+	call	L(0)
+	cfi_adjust_cfa_offset (4)
+L(0):	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$(L(oop)-L(0)-3),%eax
+	addl	$4,%esp
+	cfi_adjust_cfa_offset (-4)
+#else
+/* Calculate start address in loop for non-PIC.  */
+ 	leal	(L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+	/* Both variants compute L(oop) - 3 + 9 * %eax.
+	   NOTE(review): this presumably relies on the first movl/adcl/movl
+	   group assembling to 6 bytes and every later group to 9 bytes;
+	   confirm against the assembler output before changing the loop
+	   body.  */
+	jmp	*%eax			/* jump into loop */
+	ALIGN (3)
+	/* One pass: eight add-with-carry steps, one limb each.  */
+L(oop):	movl	(%esi),%eax
+	adcl	(%edx),%eax
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	adcl	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	adcl	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	adcl	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	adcl	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	adcl	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	adcl	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	adcl	28(%edx),%eax
+	movl	%eax,28(%edi)
+	/* Advance the pointers with leal and count with decl: neither
+	   modifies the carry flag, so the carry chain continues into the
+	   next pass.  */
+	leal	32(%edi),%edi
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx
+	jnz	L(oop)
+
+	/* Turn the final carry flag into the 0/1 return value.  */
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_add_n)
diff --git a/REORG.TODO/sysdeps/i386/addmul_1.S b/REORG.TODO/sysdeps/i386/addmul_1.S
new file mode 100644
index 0000000000..ad90ea53e5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/addmul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+   the result to a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define sizeP ecx
+#define s2_limb ebx
+
+	.text
+/* __mpn_addmul_1: multiply each of the SIZE limbs at S1 by S2LIMB, add
+   the products into the limbs at RES, and return the final carry limb
+   in %eax.  */
+ENTRY (__mpn_addmul_1)
+
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %sizeP
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	/* Point both vectors just past their last limb and negate the
+	   count, so that (ptr,%sizeP,4) walks forward through the limbs
+	   while %sizeP counts up towards zero.  */
+	leal	(%res_ptr,%sizeP,4), %res_ptr
+	leal	(%s1_ptr,%sizeP,4), %s1_ptr
+	negl	%sizeP
+	/* %ebp holds the carry limb between iterations; it starts at 0.  */
+	xorl	%ebp, %ebp
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+L(oop):
+	/* %edx:%eax = S1 limb * s2_limb, plus the incoming carry limb.  */
+	movl	(%s1_ptr,%sizeP,4), %eax
+	mull	%s2_limb
+	addl	%ebp, %eax
+	adcl	$0, %edx
+	/* Add the low half into the RES limb; the high half, including the
+	   carry out of that addition, becomes the next carry limb.  */
+	addl	%eax, (%res_ptr,%sizeP,4)
+	adcl	$0, %edx
+	movl	%edx, %ebp
+
+	incl	%sizeP
+	jnz	L(oop)
+	/* Return the last carry limb.  */
+	movl	%ebp, %eax
+
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+END (__mpn_addmul_1)
diff --git a/REORG.TODO/sysdeps/i386/asm-syntax.h b/REORG.TODO/sysdeps/i386/asm-syntax.h
new file mode 100644
index 0000000000..a992da2dd1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/asm-syntax.h
@@ -0,0 +1,24 @@
+/* Definitions for x86 syntax variations.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.  Its master source is NOT part of
+   the C library, however.  The master source lives in the GNU MP Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* ALIGN (LOG) expands to a `.align' directive whose operand is 1<<LOG.  */
+#undef ALIGN
+#define ALIGN(log) .align 1<<log
+
+/* L (BODY) pastes `.L' in front of BODY to form a label name.  */
+#undef L
+#define L(body) .L##body
diff --git a/REORG.TODO/sysdeps/i386/atomic-machine.h b/REORG.TODO/sysdeps/i386/atomic-machine.h
new file mode 100644
index 0000000000..0e24200617
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/atomic-machine.h
@@ -0,0 +1,545 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <tls.h>	/* For tcbhead_t.  */
+
+
+typedef int8_t atomic8_t;
+typedef uint8_t uatomic8_t;
+typedef int_fast8_t atomic_fast8_t;
+typedef uint_fast8_t uatomic_fast8_t;
+
+typedef int16_t atomic16_t;
+typedef uint16_t uatomic16_t;
+typedef int_fast16_t atomic_fast16_t;
+typedef uint_fast16_t uatomic_fast16_t;
+
+typedef int32_t atomic32_t;
+typedef uint32_t uatomic32_t;
+typedef int_fast32_t atomic_fast32_t;
+typedef uint_fast32_t uatomic_fast32_t;
+
+typedef int64_t atomic64_t;
+typedef uint64_t uatomic64_t;
+typedef int_fast64_t atomic_fast64_t;
+typedef uint_fast64_t uatomic_fast64_t;
+
+typedef intptr_t atomicptr_t;
+typedef uintptr_t uatomicptr_t;
+typedef intmax_t atomic_max_t;
+typedef uintmax_t uatomic_max_t;
+
+
+#ifndef LOCK_PREFIX
+# ifdef UP
+#  define LOCK_PREFIX	/* nothing */
+# else
+#  define LOCK_PREFIX "lock;"
+# endif
+#endif
+
+#define __HAVE_64B_ATOMICS 0
+#define USE_ATOMIC_COMPILER_BUILTINS 0
+#define ATOMIC_EXCHANGE_USES_CAS 0
+
+
+#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
+  __sync_val_compare_and_swap (mem, oldval, newval)
+#define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
+  (! __sync_bool_compare_and_swap (mem, oldval, newval))
+
+
+#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("cmpl $0, %%gs:%P5\n\t"                                \
+                       "je 0f\n\t"                                            \
+                       "lock\n"                                               \
+                       "0:\tcmpxchgb %b2, %1"				      \
+		       : "=a" (ret), "=m" (*mem)			      \
+		       : "q" (newval), "m" (*mem), "0" (oldval),	      \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+
+#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("cmpl $0, %%gs:%P5\n\t"                                \
+                       "je 0f\n\t"                                            \
+                       "lock\n"                                               \
+                       "0:\tcmpxchgw %w2, %1"				      \
+		       : "=a" (ret), "=m" (*mem)			      \
+		       : "r" (newval), "m" (*mem), "0" (oldval),	      \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+
+#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("cmpl $0, %%gs:%P5\n\t"                                \
+                       "je 0f\n\t"                                            \
+                       "lock\n"                                               \
+                       "0:\tcmpxchgl %2, %1"				      \
+		       : "=a" (ret), "=m" (*mem)			      \
+		       : "r" (newval), "m" (*mem), "0" (oldval),	      \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+
+/* XXX We do not really need 64-bit compare-and-exchange, at least
+   not at the moment.  Using it would cause portability problems,
+   since not many other 32-bit architectures have support for
+   such an operation.  So don't define any code for now.  If it is
+   really going to be used the code below can be used on Intel Pentium
+   and later, but NOT on i486.  */
+#if 1
+# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
+  ({ __typeof (*mem) ret = *(mem);					      \
+     abort ();								      \
+     ret = (newval);							      \
+     ret = (oldval);							      \
+     ret; })
+# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
+  ({ __typeof (*mem) ret = *(mem);					      \
+     abort ();								      \
+     ret = (newval);							      \
+     ret = (oldval);							      \
+     ret; })
+#else
+# ifdef __PIC__
+#  define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("xchgl %2, %%ebx\n\t"				      \
+		       LOCK_PREFIX "cmpxchg8b %1\n\t"			      \
+		       "xchgl %2, %%ebx"				      \
+		       : "=A" (ret), "=m" (*mem)			      \
+		       : "DS" (((unsigned long long int) (newval))	      \
+			       & 0xffffffff),				      \
+			 "c" (((unsigned long long int) (newval)) >> 32),     \
+			 "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+					  & 0xffffffff),		      \
+			 "d" (((unsigned long long int) (oldval)) >> 32));    \
+     ret; })
+
+#  define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("xchgl %2, %%ebx\n\t"				      \
+		       "cmpl $0, %%gs:%P7\n\t"				      \
+		       "je 0f\n\t"					      \
+		       "lock\n"						      \
+		       "0:\tcmpxchg8b %1\n\t"				      \
+		       "xchgl %2, %%ebx"				      \
+		       : "=A" (ret), "=m" (*mem)			      \
+		       : "DS" (((unsigned long long int) (newval))	      \
+			       & 0xffffffff),				      \
+			 "c" (((unsigned long long int) (newval)) >> 32),     \
+			 "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+					  & 0xffffffff),		      \
+			 "d" (((unsigned long long int) (oldval)) >> 32),     \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+# else
+#  define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile (LOCK_PREFIX "cmpxchg8b %1"			      \
+		       : "=A" (ret), "=m" (*mem)			      \
+		       : "b" (((unsigned long long int) (newval))	      \
+			      & 0xffffffff),				      \
+			 "c" (((unsigned long long int) (newval)) >> 32),     \
+			 "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+					  & 0xffffffff),		      \
+			 "d" (((unsigned long long int) (oldval)) >> 32));    \
+     ret; })
+
+#  define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+  ({ __typeof (*mem) ret;						      \
+     __asm __volatile ("cmpl $0, %%gs:%P7\n\t"				      \
+		       "je 0f\n\t"					      \
+		       "lock\n"						      \
+		       "0:\tcmpxchg8b %1"				      \
+		       : "=A" (ret), "=m" (*mem)			      \
+		       : "b" (((unsigned long long int) (newval))	      \
+			      & 0xffffffff),				      \
+			 "c" (((unsigned long long int) (newval)) >> 32),     \
+			 "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+					  & 0xffffffff),		      \
+			 "d" (((unsigned long long int) (oldval)) >> 32),     \
+			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     ret; })
+# endif
+#endif
+
+
+/* Store NEWVALUE in *MEM and evaluate to the value *MEM held before.
+   Objects of 1, 2 and 4 bytes use the matching xchg instruction; any
+   other size calls abort ().
+   Note that we need no lock prefix.  */
+#define atomic_exchange_acq(mem, newvalue) \
+  ({ __typeof (*mem) result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile ("xchgb %b0, %1"				      \
+			 : "=q" (result), "=m" (*mem)			      \
+			 : "0" (newvalue), "m" (*mem));			      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile ("xchgw %w0, %1"				      \
+			 : "=r" (result), "=m" (*mem)			      \
+			 : "0" (newvalue), "m" (*mem));			      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile ("xchgl %0, %1"					      \
+			 : "=r" (result), "=m" (*mem)			      \
+			 : "0" (newvalue), "m" (*mem));			      \
+     else								      \
+       {								      \
+	 result = 0;							      \
+	 abort ();							      \
+       }								      \
+     result; })
+
+
+/* Atomically add VALUE to *MEM and evaluate to the value *MEM held
+   before the addition.  LOCK is the instruction-prefix string pasted in
+   front of the xadd; PFX selects the PFX##_compare_and_exchange_val_64_acq
+   macro used for objects that are not 1, 2 or 4 bytes wide.  That
+   fallback retries the compare-and-exchange until it returns the value
+   it was asked to replace, i.e. until the exchange actually happened.  */
+#define __arch_exchange_and_add_body(lock, pfx, mem, value) \
+  ({ __typeof (*mem) __result;						      \
+     __typeof (value) __addval = (value);				      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (lock "xaddb %b0, %1"				      \
+			 : "=q" (__result), "=m" (*mem)			      \
+			 : "0" (__addval), "m" (*mem),			      \
+			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (lock "xaddw %w0, %1"				      \
+			 : "=r" (__result), "=m" (*mem)			      \
+			 : "0" (__addval), "m" (*mem),			      \
+			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (lock "xaddl %0, %1"				      \
+			 : "=r" (__result), "=m" (*mem)			      \
+			 : "0" (__addval), "m" (*mem),			      \
+			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+     else								      \
+       {								      \
+	 __typeof (mem) __memp = (mem);					      \
+	 __typeof (*mem) __tmpval;					      \
+	 __result = *__memp;						      \
+	 do								      \
+	   __tmpval = __result;						      \
+	 while ((__result = pfx##_compare_and_exchange_val_64_acq	      \
+		 (__memp, __result + __addval, __result)) != __tmpval);	      \
+       }								      \
+     __result; })
+
+#define atomic_exchange_and_add(mem, value) \
+  __sync_fetch_and_add (mem, value)
+
+#define __arch_exchange_and_add_cprefix \
+  "cmpl $0, %%gs:%P4\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_exchange_and_add(mem, value) \
+  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
+				mem, value)
+
+
+/* Atomically add VALUE to *MEM; no value is produced.  A compile-time
+   constant VALUE of 1 or -1 is handed to atomic_increment or
+   atomic_decrement.  Otherwise LOCK is the instruction-prefix string
+   pasted in front of the add, and PFX selects the
+   PFX##_compare_and_exchange_val_64_acq macro used for objects that are
+   not 1, 2 or 4 bytes wide.  That fallback retries the
+   compare-and-exchange until it returns the value it was asked to
+   replace, i.e. until the exchange actually happened.  */
+#define __arch_add_body(lock, pfx, mem, value) \
+  do {									      \
+    if (__builtin_constant_p (value) && (value) == 1)			      \
+      atomic_increment (mem);						      \
+    else if (__builtin_constant_p (value) && (value) == -1)		      \
+      atomic_decrement (mem);						      \
+    else if (sizeof (*mem) == 1)					      \
+      __asm __volatile (lock "addb %b1, %0"				      \
+			: "=m" (*mem)					      \
+			: "iq" (value), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "addw %w1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (value), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "addl %1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (value), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      {									      \
+	__typeof (value) __addval = (value);				      \
+	__typeof (mem) __memp = (mem);					      \
+	__typeof (*mem) __oldval = *__memp;				      \
+	__typeof (*mem) __tmpval;					      \
+	do								      \
+	  __tmpval = __oldval;						      \
+	while ((__oldval = pfx##_compare_and_exchange_val_64_acq	      \
+		(__memp, __oldval + __addval, __oldval)) != __tmpval);	      \
+      }									      \
+  } while (0)
+
+#define atomic_add(mem, value) \
+  __arch_add_body (LOCK_PREFIX, __arch, mem, value)
+
+#define __arch_add_cprefix \
+  "cmpl $0, %%gs:%P3\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_add(mem, value) \
+  __arch_add_body (__arch_add_cprefix, __arch_c, mem, value)
+
+
+#define atomic_add_negative(mem, value) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "iq" (value), "m" (*mem));			      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "ir" (value), "m" (*mem));			      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "ir" (value), "m" (*mem));			      \
+     else								      \
+       abort ();							      \
+     __result; })
+
+
+#define atomic_add_zero(mem, value) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "iq" (value), "m" (*mem));			      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "ir" (value), "m" (*mem));			      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "ir" (value), "m" (*mem));			      \
+     else								      \
+       abort ();							      \
+     __result; })
+
+
+/* Atomically add 1 to *MEM; no value is produced.  LOCK is the
+   instruction-prefix string pasted in front of the inc, and PFX selects
+   the PFX##_compare_and_exchange_val_64_acq macro used for objects that
+   are not 1, 2 or 4 bytes wide.  That fallback retries the
+   compare-and-exchange until it returns the value it was asked to
+   replace, i.e. until the exchange actually happened.  */
+#define __arch_increment_body(lock,  pfx, mem) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (lock "incb %b0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "incw %w0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "incl %0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      {									      \
+	__typeof (mem) __memp = (mem);					      \
+	__typeof (*mem) __oldval = *__memp;				      \
+	__typeof (*mem) __tmpval;					      \
+	do								      \
+	  __tmpval = __oldval;						      \
+	while ((__oldval = pfx##_compare_and_exchange_val_64_acq	      \
+		(__memp, __oldval + 1, __oldval)) != __tmpval);		      \
+      }									      \
+  } while (0)
+
+#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
+
+#define __arch_increment_cprefix \
+  "cmpl $0, %%gs:%P2\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_increment(mem) \
+  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
+
+
+/* Atomically add 1 to *MEM and evaluate to nonzero iff the incremented
+   value is zero.  Objects that are not 1, 2 or 4 bytes wide call
+   abort ().  The result operand is always the one-byte __result, so sete
+   takes it as plain %1 whatever the width of *MEM, exactly as
+   atomic_decrement_and_test does.  */
+#define atomic_increment_and_test(mem) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "incb %0; sete %1"			      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "incw %0; sete %1"			      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "incl %0; sete %1"			      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else								      \
+       abort ();							      \
+     __result; })
+
+
+/* Atomically subtract 1 from *MEM; no value is produced.  LOCK is the
+   instruction-prefix string pasted in front of the dec, and PFX selects
+   the PFX##_compare_and_exchange_val_64_acq macro used for objects that
+   are not 1, 2 or 4 bytes wide.  That fallback retries the
+   compare-and-exchange until it returns the value it was asked to
+   replace, i.e. until the exchange actually happened.  */
+#define __arch_decrement_body(lock, pfx, mem) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (lock "decb %b0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "decw %w0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "decl %0"					      \
+			: "=m" (*mem)					      \
+			: "m" (*mem),					      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      {									      \
+	__typeof (mem) __memp = (mem);					      \
+	__typeof (*mem) __oldval = *__memp;				      \
+	__typeof (*mem) __tmpval;					      \
+	do								      \
+	  __tmpval = __oldval;						      \
+	while ((__oldval = pfx##_compare_and_exchange_val_64_acq	      \
+		(__memp, __oldval - 1, __oldval)) != __tmpval);		      \
+      }									      \
+  } while (0)
+
+#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
+
+#define __arch_decrement_cprefix \
+  "cmpl $0, %%gs:%P2\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_decrement(mem) \
+  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
+
+
+#define atomic_decrement_and_test(mem) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "decb %b0; sete %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "decw %w0; sete %1"		      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "decl %0; sete %1"			      \
+			 : "=m" (*mem), "=qm" (__result)		      \
+			 : "m" (*mem));					      \
+     else								      \
+       abort ();							      \
+     __result; })
+
+
+#define atomic_bit_set(mem, bit) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (LOCK_PREFIX "orb %b2, %0"			      \
+			: "=m" (*mem)					      \
+			: "m" (*mem), "iq" (1 << (bit)));		      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (LOCK_PREFIX "orw %w2, %0"			      \
+			: "=m" (*mem)					      \
+			: "m" (*mem), "ir" (1 << (bit)));		      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (LOCK_PREFIX "orl %2, %0"			      \
+			: "=m" (*mem)					      \
+			: "m" (*mem), "ir" (1 << (bit)));		      \
+    else								      \
+      abort ();								      \
+  } while (0)
+
+
+/* Atomically set bit number BIT in *MEM and evaluate to the carry flag
+   left by the bts instruction, captured with setc.  Objects that are
+   not 1, 2 or 4 bytes wide call abort ().
+   NOTE(review): x86 does not appear to have a byte-sized bts form, so
+   the sizeof (*mem) == 1 branch ("btsb") probably cannot be assembled
+   if it is ever instantiated -- confirm against the instruction set
+   reference.  */
+#define atomic_bit_test_set(mem, bit) \
+  ({ unsigned char __result;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0"		      \
+			 : "=q" (__result), "=m" (*mem)			      \
+			 : "m" (*mem), "ir" (bit));			      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0"		      \
+			 : "=q" (__result), "=m" (*mem)			      \
+			 : "m" (*mem), "ir" (bit));			      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0"		      \
+			 : "=q" (__result), "=m" (*mem)			      \
+			 : "m" (*mem), "ir" (bit));			      \
+     else							      	      \
+       abort ();							      \
+     __result; })
+
+
+#define atomic_spin_nop() asm ("rep; nop")
+
+
+#define __arch_and_body(lock, mem, mask) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (lock "andb %b1, %0"				      \
+			: "=m" (*mem)					      \
+			: "iq" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "andw %w1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "andl %1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      abort ();								      \
+  } while (0)
+
+#define __arch_cprefix \
+  "cmpl $0, %%gs:%P3\n\tje 0f\n\tlock\n0:\t"
+
+#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
+
+#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
+
+
+#define __arch_or_body(lock, mem, mask) \
+  do {									      \
+    if (sizeof (*mem) == 1)						      \
+      __asm __volatile (lock "orb %b1, %0"				      \
+			: "=m" (*mem)					      \
+			: "iq" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 2)					      \
+      __asm __volatile (lock "orw %w1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else if (sizeof (*mem) == 4)					      \
+      __asm __volatile (lock "orl %1, %0"				      \
+			: "=m" (*mem)					      \
+			: "ir" (mask), "m" (*mem),			      \
+			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+    else								      \
+      abort ();								      \
+  } while (0)
+
+#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
+
+#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
+
+/* We don't use mfence because it is supposedly slower due to having to
+   provide stronger guarantees (e.g., regarding self-modifying code).  */
+#define atomic_full_barrier() \
+    __asm __volatile (LOCK_PREFIX "orl $0, (%%esp)" ::: "memory")
+#define atomic_read_barrier() __asm ("" ::: "memory")
+#define atomic_write_barrier() __asm ("" ::: "memory")
diff --git a/REORG.TODO/sysdeps/i386/backtrace.c b/REORG.TODO/sysdeps/i386/backtrace.c
new file mode 100644
index 0000000000..ee8238d0ce
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/backtrace.c
@@ -0,0 +1,163 @@
+/* Return backtrace of current program state.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <libc-lock.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <unwind.h>
+
+struct trace_arg
+{
+  void **array;
+  int cnt, size;
+  void *lastebp, *lastesp;
+};
+
+#ifdef SHARED
+static _Unwind_Reason_Code (*unwind_backtrace) (_Unwind_Trace_Fn, void *);
+static _Unwind_Ptr (*unwind_getip) (struct _Unwind_Context *);
+static _Unwind_Ptr (*unwind_getcfa) (struct _Unwind_Context *);
+static _Unwind_Ptr (*unwind_getgr) (struct _Unwind_Context *, int);
+static void *libgcc_handle;
+
+/* Load libgcc_s.so.1 and look up the unwinder entry points used by
+   __backtrace.  If the library or any of the four symbols is missing,
+   unwind_backtrace ends up NULL, which __backtrace checks for, and the
+   library is not kept open.  */
+static void
+init (void)
+{
+  libgcc_handle = __libc_dlopen ("libgcc_s.so.1");
+
+  if (libgcc_handle == NULL)
+    return;
+
+  unwind_backtrace = __libc_dlsym (libgcc_handle, "_Unwind_Backtrace");
+  unwind_getip = __libc_dlsym (libgcc_handle, "_Unwind_GetIP");
+  unwind_getcfa = __libc_dlsym (libgcc_handle, "_Unwind_GetCFA");
+  unwind_getgr = __libc_dlsym (libgcc_handle, "_Unwind_GetGR");
+  if (unwind_backtrace == NULL || unwind_getip == NULL
+      || unwind_getgr == NULL || unwind_getcfa == NULL)
+    {
+      /* An incomplete set of entry points is unusable; release the
+	 library instead of keeping it loaded for nothing.  */
+      unwind_backtrace = NULL;
+      __libc_dlclose (libgcc_handle);
+      libgcc_handle = NULL;
+    }
+}
+#else
+# define unwind_backtrace _Unwind_Backtrace
+# define unwind_getip _Unwind_GetIP
+# define unwind_getcfa _Unwind_GetCFA
+# define unwind_getgr _Unwind_GetGR
+#endif
+
+/* Callback passed to the unwinder by __backtrace.  A points to the
+   struct trace_arg being filled in.  */
+static _Unwind_Reason_Code
+backtrace_helper (struct _Unwind_Context *ctx, void *a)
+{
+  struct trace_arg *state = a;
+
+  /* The first call reports an address inside __backtrace itself, which
+     is not recorded; a count of -1 marks that call.  */
+  if (state->cnt != -1)
+    state->array[state->cnt] = (void *) unwind_getip (ctx);
+
+  /* Stop the unwinder once the caller's array is full.  */
+  state->cnt += 1;
+  if (state->cnt == state->size)
+    return _URC_END_OF_STACK;
+
+  /* Remember this frame's %ebp (DWARF2 register 5 on IA-32) and its CFA
+     for the manual stack walk in __backtrace.  */
+  state->lastebp = (void *) unwind_getgr (ctx, 5);
+  state->lastesp = (void *) unwind_getcfa (ctx);
+  return _URC_NO_REASON;
+}
+
+
+/* This is a global variable set at program start time.  It marks the
+   highest used stack address.  */
+extern void *__libc_stack_end;
+
+
+/* This is the stack layout we see with every stack frame
+   when the code is compiled with a frame pointer.
+
+            +-----------------+        +-----------------+
+    %ebp -> | %ebp last frame--------> | %ebp last frame--->...
+            |                 |        |                 |
+            | return address  |        | return address  |
+            +-----------------+        +-----------------+
+
+   First try to get as far as possible using
+   _Unwind_Backtrace, which handles -fomit-frame-pointer
+   as well but requires .eh_frame info.  Then fall back to
+   walking the stack manually.  */
+
+struct layout
+{
+  struct layout *ebp;
+  void *ret;
+};
+
+
+/* Store up to SIZE return addresses of the current call chain in ARRAY
+   and return the number stored.  Returns 0 if SIZE is not positive, if
+   (in a shared build) no unwinder could be loaded, or if the unwinder
+   never invoked the callback.  */
+int
+__backtrace (void **array, int size)
+{
+  struct trace_arg arg = { .array = array, .size = size, .cnt = -1 };
+
+  if (size <= 0)
+    return 0;
+
+#ifdef SHARED
+  __libc_once_define (static, once);
+
+  __libc_once (once, init);
+  if (unwind_backtrace == NULL)
+    return 0;
+#endif
+
+  unwind_backtrace (backtrace_helper, &arg);
+
+  /* If the last recorded address is NULL, do not count it.  Otherwise,
+     if there is room left, continue by following the %ebp chain.  The
+     walk needs the %ebp/CFA pair saved by backtrace_helper, so it is
+     skipped when the helper never ran (cnt is still -1): lastebp would
+     be NULL and the first store would land at array[-1].  */
+  if (arg.cnt > 1 && arg.array[arg.cnt - 1] == NULL)
+    --arg.cnt;
+  else if (arg.cnt >= 0 && arg.cnt < size)
+    {
+      struct layout *ebp = (struct layout *) arg.lastebp;
+
+      while (arg.cnt < size)
+	{
+	  /* Check for out of range.  */
+	  if ((void *) ebp < arg.lastesp || (void *) ebp > __libc_stack_end
+	      || ((long) ebp & 3))
+	    break;
+
+	  array[arg.cnt++] = ebp->ret;
+	  ebp = ebp->ebp;
+	}
+    }
+  return arg.cnt != -1 ? arg.cnt : 0;
+}
+weak_alias (__backtrace, backtrace)
+libc_hidden_def (__backtrace)
+
+
+#ifdef SHARED
+/* Forget the unwinder entry point and close libgcc_s if init opened
+   it.  */
+libc_freeres_fn (free_mem)
+{
+  unwind_backtrace = NULL;
+  if (libgcc_handle == NULL)
+    return;
+
+  __libc_dlclose (libgcc_handle);
+  libgcc_handle = NULL;
+}
+#endif
diff --git a/REORG.TODO/sysdeps/i386/bcopy.S b/REORG.TODO/sysdeps/i386/bcopy.S
new file mode 100644
index 0000000000..12b8ddb886
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bcopy.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		bcopy
+#include "memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/bsd-_setjmp.S b/REORG.TODO/sysdeps/i386/bsd-_setjmp.S
new file mode 100644
index 0000000000..6496304946
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bsd-_setjmp.S
@@ -0,0 +1,56 @@
+/* BSD `_setjmp' entry point to `sigsetjmp (..., 0)'.  i386 version.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This just does a tail-call to `__sigsetjmp (ARG, 0)'.
+   We cannot do it in C because it must be a tail-call, so frame-unwinding
+   in setjmp doesn't clobber the state restored by longjmp.  */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <stap-probe.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define JMPBUF	PARMS
+#define SIGMSK	JMPBUF+4
+
+ENTRY (_setjmp)
+
+	xorl %eax, %eax
+	movl JMPBUF(%esp), %edx
+
+	/* Save registers.  */
+	movl %ebx, (JB_BX*4)(%edx)
+	movl %esi, (JB_SI*4)(%edx)
+	movl %edi, (JB_DI*4)(%edx)
+	leal JMPBUF(%esp), %ecx	/* Save SP as it will be after we return.  */
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_SP*4)(%edx)
+	movl 0(%esp), %ecx	/* Save PC we are returning to now.  */
+	LIBC_PROBE (setjmp, 3, 4@%edx, -4@$0, 4@%ecx)
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_PC*4)(%edx)
+	movl %ebp, (JB_BP*4)(%edx) /* Save caller's frame pointer.  */
+
+	movl %eax, JB_SIZE(%edx) /* No signal mask set.  */
+	ret
+END (_setjmp)
+libc_hidden_def (_setjmp)
diff --git a/REORG.TODO/sysdeps/i386/bsd-setjmp.S b/REORG.TODO/sysdeps/i386/bsd-setjmp.S
new file mode 100644
index 0000000000..5710e1f42b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bsd-setjmp.S
@@ -0,0 +1,66 @@
+/* BSD `setjmp' entry point to `sigsetjmp (..., 1)'.  i386 version.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This just does a tail-call to `__sigsetjmp (ARG, 1)'.
+   We cannot do it in C because it must be a tail-call, so frame-unwinding
+   in setjmp doesn't clobber the state restored by longjmp.  */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <stap-probe.h>
+
+#define PARMS  4		/* no space for saved regs */
+#define JMPBUF PARMS
+#define SIGMSK JMPBUF+4
+
+ENTRY (setjmp)
+	/* Note that we have to use a non-exported symbol in the next
+	   jump since otherwise gas will emit it as a jump through the
+	   PLT which is what we cannot use here.  */
+
+	movl JMPBUF(%esp), %eax
+
+	/* Save registers.  */
+	movl %ebx, (JB_BX*4)(%eax)
+	movl %esi, (JB_SI*4)(%eax)
+	movl %edi, (JB_DI*4)(%eax)
+	leal JMPBUF(%esp), %ecx	/* Save SP as it will be after we return.  */
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_SP*4)(%eax)
+	movl 0(%esp), %ecx	/* Save PC we are returning to now.  */
+	LIBC_PROBE (setjmp, 3, 4@%eax, -4@$1, 4@%ecx)
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)
+#endif
+	movl %ecx, (JB_PC*4)(%eax)
+	movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer.  */
+
+	/* Call __sigjmp_save.  */
+	pushl $1
+	cfi_adjust_cfa_offset (4)
+	pushl 8(%esp)
+	cfi_adjust_cfa_offset (4)
+	call __sigjmp_save
+	popl %ecx
+	cfi_adjust_cfa_offset (-4)
+	popl %edx
+	cfi_adjust_cfa_offset (-4)
+	ret
+END (setjmp)
diff --git a/REORG.TODO/sysdeps/i386/bzero.S b/REORG.TODO/sysdeps/i386/bzero.S
new file mode 100644
index 0000000000..c8dd47b4da
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bzero.S
@@ -0,0 +1,5 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include "memset.S"
+
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/cacheinfo.c b/REORG.TODO/sysdeps/i386/cacheinfo.c
new file mode 100644
index 0000000000..f15fe0779a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/cacheinfo.c
@@ -0,0 +1,3 @@
+#define DISABLE_PREFETCHW
+
+#include <sysdeps/x86/cacheinfo.c>
diff --git a/REORG.TODO/sysdeps/i386/configure b/REORG.TODO/sysdeps/i386/configure
new file mode 100644
index 0000000000..5b55c5affe
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/configure
@@ -0,0 +1,84 @@
+# This file is generated from configure.ac by Autoconf.  DO NOT EDIT!
+ # Local configure fragment for sysdeps/i386.
+
+# We no longer support i386 since it lacks the atomic instructions
+# required to implement NPTL threading.
+if test "$config_machine" = i386; then
+  as_fn_error $? "
+*** ERROR: Support for i386 is deprecated.
+*** Please use host i786, i686, i586 or i486.
+*** For example: /src/glibc/configure --host=i686-pc-linux-gnu ..." "$LINENO" 5
+fi
+
+# The GNU C Library can't be built for i386.  There are several reasons for
+# this restriction.  The primary reason is that i386 lacks the atomic
+# operations required to support the current NPTL implementation.  While it is
+# possible that such atomic operations could be emulated in the kernel to date
+# no such work has been done to enable this.  Even with NPTL disabled you still
+# have no atomic.h implementation.  Given the declining use of i386 we disable
+# support for building with `-march=i386' or `-mcpu=i386.' We don't explicitly
+# check for i386, instead we make sure the compiler has support for inlining
+# the builtin __sync_val_compare_and_swap. If it does then we should have no
+# problem building for i386.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for compiler support of inlined builtin function __sync_val_compare_and_swap" >&5
+$as_echo_n "checking for compiler support of inlined builtin function __sync_val_compare_and_swap... " >&6; }
+libc_compiler_builtin_inlined=no
+cat > conftest.c <<EOF
+int _start (void) { int a, b, c; __sync_val_compare_and_swap (&a, b, c); return 0; }
+EOF
+if ! { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS
+		     -O0 -nostdlib -nostartfiles
+		     -S conftest.c -o - | fgrep "__sync_val_compare_and_swap"
+		     1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then
+  libc_compiler_builtin_inlined=yes
+fi
+rm -f conftest*
+if test $libc_compiler_builtin_inlined = yes; then
+  libc_cv_unsupported_i386=no
+else
+  as_fn_error $? "
+*** Building with -march=i386/-mcpu=i386 is not supported.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /source/glibc/configure CFLAGS='-O2 -march=i686' ..." "$LINENO" 5
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_compiler_builtin_inlined" >&5
+$as_echo "$libc_compiler_builtin_inlined" >&6; }
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
+$as_echo_n "checking for Intel MPX support... " >&6; }
+if ${libc_cv_asm_mpx+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5
+$as_echo "$libc_cv_asm_mpx" >&6; }
+if test $libc_cv_asm_mpx = yes; then
+  $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h
+
+fi
+
+$as_echo "#define USE_REGPARMS 1" >>confdefs.h
+
+
+$as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
+
diff --git a/REORG.TODO/sysdeps/i386/configure.ac b/REORG.TODO/sysdeps/i386/configure.ac
new file mode 100644
index 0000000000..19ef33f34a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/configure.ac
@@ -0,0 +1,52 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/i386.
+
+# We no longer support i386 since it lacks the atomic instructions
+# required to implement NPTL threading.
+if test "$config_machine" = i386; then
+  AC_MSG_ERROR([
+*** ERROR: Support for i386 is deprecated.
+*** Please use host i786, i686, i586 or i486.
+*** For example: /src/glibc/configure --host=i686-pc-linux-gnu ...])
+fi
+
+# The GNU C Library can't be built for i386.  There are several reasons for
+# this restriction.  The primary reason is that i386 lacks the atomic
+# operations required to support the current NPTL implementation.  While it is
+# possible that such atomic operations could be emulated in the kernel to date
+# no such work has been done to enable this.  Even with NPTL disabled you still
+# have no atomic.h implementation.  Given the declining use of i386 we disable
+# support for building with `-march=i386' or `-mcpu=i386.' We don't explicitly
+# check for i386, instead we make sure the compiler has support for inlining
+# the builtin __sync_val_compare_and_swap. If it does then we should have no
+# problem building for i386.
+LIBC_COMPILER_BUILTIN_INLINED(
+  [__sync_val_compare_and_swap],
+  [int a, b, c; __sync_val_compare_and_swap (&a, b, c);],
+  [-O0],
+  [libc_cv_unsupported_i386=no],
+  [AC_MSG_ERROR([
+*** Building with -march=i386/-mcpu=i386 is not supported.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /source/glibc/configure CFLAGS='-O2 -march=i686' ...])])
+
+dnl Check whether asm supports Intel MPX
+AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
+cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*])
+if test $libc_cv_asm_mpx = yes; then
+  AC_DEFINE(HAVE_MPX_SUPPORT)
+fi
+
+AC_DEFINE(USE_REGPARMS)
+
+dnl It is always possible to access static and hidden symbols in an
+dnl position independent way.
+AC_DEFINE(PI_STATIC_AND_HIDDEN)
diff --git a/REORG.TODO/sysdeps/i386/crti.S b/REORG.TODO/sysdeps/i386/crti.S
new file mode 100644
index 0000000000..f800209990
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/crti.S
@@ -0,0 +1,84 @@
+/* Special .init and .fini section support for x86.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file with other
+   programs, and to distribute those programs without any restriction
+   coming from the use of this file. (The GNU Lesser General Public
+   License restrictions do apply in other respects; for example, they
+   cover modification of the file, and distribution when not linked
+   into another program.)
+
+   Note that people who make modified versions of this file are not
+   obligated to grant this special exception for their modified
+   versions; it is their choice whether to do so. The GNU Lesser
+   General Public License gives permission to release a modified
+   version without this exception; this exception also makes it
+   possible to release a modified version which carries forward this
+   exception.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* crti.S puts a function prologue at the beginning of the .init and
+   .fini sections and defines global symbols for those addresses, so
+   they can be called as functions.  The symbols _init and _fini are
+   magic and cause the linker to emit DT_INIT and DT_FINI.  */
+
+#include <libc-symbols.h>
+#include <sysdep.h>
+
+#ifndef PREINIT_FUNCTION
+# define PREINIT_FUNCTION __gmon_start__
+#endif
+
+#ifndef PREINIT_FUNCTION_WEAK
+# define PREINIT_FUNCTION_WEAK 1
+#endif
+
+#if PREINIT_FUNCTION_WEAK
+	weak_extern (PREINIT_FUNCTION)
+#else
+	.hidden PREINIT_FUNCTION
+#endif
+
+	.section .init,"ax",@progbits
+	.p2align 2
+	.globl _init
+	.type _init, @function
+_init:
+	pushl %ebx
+	/* Maintain 16-byte stack alignment for called functions.  */
+	subl $8, %esp
+	LOAD_PIC_REG (bx)
+#if PREINIT_FUNCTION_WEAK
+	movl PREINIT_FUNCTION@GOT(%ebx), %eax
+	testl %eax, %eax
+	je .Lno_weak_fn
+	call PREINIT_FUNCTION@PLT
+.Lno_weak_fn:
+#else
+	call PREINIT_FUNCTION
+#endif
+
+	.section .fini,"ax",@progbits
+	.p2align 2
+	.globl _fini
+	.type _fini, @function
+_fini:
+	pushl %ebx
+	subl $8, %esp
+	LOAD_PIC_REG (bx)
diff --git a/REORG.TODO/sysdeps/i386/crtn.S b/REORG.TODO/sysdeps/i386/crtn.S
new file mode 100644
index 0000000000..b18b9c171a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/crtn.S
@@ -0,0 +1,47 @@
+/* Special .init and .fini section support for x86.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file with other
+   programs, and to distribute those programs without any restriction
+   coming from the use of this file. (The GNU Lesser General Public
+   License restrictions do apply in other respects; for example, they
+   cover modification of the file, and distribution when not linked
+   into another program.)
+
+   Note that people who make modified versions of this file are not
+   obligated to grant this special exception for their modified
+   versions; it is their choice whether to do so. The GNU Lesser
+   General Public License gives permission to release a modified
+   version without this exception; this exception also makes it
+   possible to release a modified version which carries forward this
+   exception.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* crtn.S puts function epilogues in the .init and .fini sections
+   corresponding to the prologues in crti.S. */
+
+	.section .init,"ax",@progbits
+	addl $8, %esp
+	popl %ebx
+	ret
+
+	.section .fini,"ax",@progbits
+	addl $8, %esp
+	popl %ebx
+	ret
diff --git a/REORG.TODO/sysdeps/i386/dl-irel.h b/REORG.TODO/sysdeps/i386/dl-irel.h
new file mode 100644
index 0000000000..824e81aed1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-irel.h
@@ -0,0 +1,51 @@
+/* Machine-dependent ELF indirect relocation inline functions.
+   i386 version.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_IREL_H
+#define _DL_IREL_H
+
+#include <stdio.h>
+#include <unistd.h>
+
+#define ELF_MACHINE_IREL	1
+
+static inline Elf32_Addr
+__attribute ((always_inline))
+elf_ifunc_invoke (Elf32_Addr addr)
+{
+  return ((Elf32_Addr (*) (void)) (addr)) ();
+}
+
+/* Apply the IRELATIVE relocation RELOC; any other type is fatal.  */
+static inline void
+__attribute ((always_inline))
+elf_irel (const Elf32_Rel *reloc)
+{
+  const unsigned long int r_type = ELF32_R_TYPE (reloc->r_info);
+  Elf32_Addr *const reloc_addr = (void *) reloc->r_offset;
+
+  /* Reject everything but R_386_IRELATIVE up front.  */
+  if (__glibc_unlikely (r_type != R_386_IRELATIVE))
+    __libc_fatal ("unexpected reloc type in static binary");
+  else
+    /* Overwrite the resolver address in the slot with its result.  */
+    *reloc_addr = elf_ifunc_invoke (*reloc_addr);
+}
+
+#endif /* dl-irel.h */
diff --git a/REORG.TODO/sysdeps/i386/dl-lookupcfg.h b/REORG.TODO/sysdeps/i386/dl-lookupcfg.h
new file mode 100644
index 0000000000..47b534a059
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-lookupcfg.h
@@ -0,0 +1,32 @@
+/* Configuration of lookup functions.
+   Copyright (C) 2005-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define DL_UNMAP_IS_SPECIAL
+
+#include_next <dl-lookupcfg.h>
+
+/* Address of protected data defined in the shared library may be
+   external due to copy relocation.   */
+#define DL_EXTERN_PROTECTED_DATA
+
+struct link_map;
+
+extern void _dl_unmap (struct link_map *map)
+  internal_function attribute_hidden;
+
+#define DL_UNMAP(map) _dl_unmap (map)
diff --git a/REORG.TODO/sysdeps/i386/dl-machine.h b/REORG.TODO/sysdeps/i386/dl-machine.h
new file mode 100644
index 0000000000..57d4a0bdbd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-machine.h
@@ -0,0 +1,757 @@
+/* Machine-dependent ELF dynamic relocation inline functions.  i386 version.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef dl_machine_h
+#define dl_machine_h
+
+#define ELF_MACHINE_NAME "i386"
+
+#include <sys/param.h>
+#include <sysdep.h>
+#include <tls.h>
+#include <dl-tlsdesc.h>
+#include <cpu-features.c>
+
+/* Return nonzero iff ELF header is compatible with the running host.  */
+static inline int __attribute__ ((unused))
+elf_machine_matches_host (const Elf32_Ehdr *ehdr)
+{
+  return ehdr->e_machine == EM_386;
+}
+
+
+/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
+   first element of the GOT, a special entry that is never relocated.  */
+static inline Elf32_Addr __attribute__ ((unused, const))
+elf_machine_dynamic (void)
+{
+  /* This produces a GOTOFF reloc that resolves to zero at link time, so in
+     fact just loads from the GOT register directly.  By doing it without
+     an asm we can let the compiler choose any register.  */
+  extern const Elf32_Addr _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
+  return _GLOBAL_OFFSET_TABLE_[0];
+}
+
+/* Return the run-time load address of the shared object.  */
+static inline Elf32_Addr __attribute__ ((unused))
+elf_machine_load_address (void)
+{
+  /* Compute the difference between the runtime address of _DYNAMIC as seen
+     by a GOTOFF reference, and the link-time address found in the special
+     unrelocated first GOT entry.  */
+  extern Elf32_Dyn bygotoff[] asm ("_DYNAMIC") attribute_hidden;
+  return (Elf32_Addr) &bygotoff - elf_machine_dynamic ();
+}
+
+/* Set up the loaded object described by L so its unrelocated PLT
+   entries will jump to the on-demand fixup code in dl-runtime.c.
+   Nothing is done unless L has DT_JMPREL and LAZY is set; returns LAZY.  */
+static inline int __attribute__ ((unused, always_inline))
+elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
+{
+  Elf32_Addr *got;
+  extern void _dl_runtime_resolve (Elf32_Word) attribute_hidden;
+  extern void _dl_runtime_profile (Elf32_Word) attribute_hidden;
+
+  if (l->l_info[DT_JMPREL] && lazy)
+    {
+      /* The GOT entries for functions in the PLT have not yet been filled
+	 in.  Their initial contents will arrange when called to push an
+	 offset into the .rel.plt section, push _GLOBAL_OFFSET_TABLE_[1],
+	 and then jump to _GLOBAL_OFFSET_TABLE_[2].  */
+      got = (Elf32_Addr *) D_PTR (l, l_info[DT_PLTGOT]);
+      /* If a library is prelinked but we have to relocate anyway,
+	 we have to be able to undo the prelinking of .got.plt.
+	 The prelinker saved the address of .plt + 0x16 here.  */
+      if (got[1])
+	{
+	  l->l_mach.plt = got[1] + l->l_addr;
+	  l->l_mach.gotplt = (Elf32_Addr) &got[3];
+	}
+      got[1] = (Elf32_Addr) l;	/* Identify this shared object.  */
+
+      /* The got[2] entry contains the address of a function which gets
+	 called to get the address of a so far unresolved function and
+	 jump to it.  The profiling extension of the dynamic linker allows
+	 intercepting the calls to collect information.  In this case we
+	 don't store the address in the GOT so that all future calls also
+	 end in this function.  */
+      if (__glibc_unlikely (profile))
+	{
+	  got[2] = (Elf32_Addr) &_dl_runtime_profile;
+
+	  if (GLRO(dl_profile) != NULL
+	      && _dl_name_match_p (GLRO(dl_profile), l))
+	    /* This is the object we are looking for.  Say that we really
+	       want profiling and the timers are started.  */
+	    GL(dl_profile_map) = l;
+	}
+      else
+	/* This function will get called to fix up the GOT entry indicated by
+	   the offset on the stack, and then jump to the resolved address.  */
+	got[2] = (Elf32_Addr) &_dl_runtime_resolve;
+    }
+
+  return lazy;
+}
+
+#ifdef IN_DL_RUNTIME
+
+# ifndef PROF
+/* We add a declaration of this function here so that in dl-runtime.c
+   the ELF_MACHINE_RUNTIME_TRAMPOLINE macro really can pass the parameters
+   in registers.
+
+   We cannot use this scheme for profiling because the _mcount call
+   destroys the passed register information.  */
+#define ARCH_FIXUP_ATTRIBUTE __attribute__ ((regparm (3), stdcall, unused))
+
+extern ElfW(Addr) _dl_fixup (struct link_map *l,
+			     ElfW(Word) reloc_offset)
+     ARCH_FIXUP_ATTRIBUTE;
+extern ElfW(Addr) _dl_profile_fixup (struct link_map *l,
+				     ElfW(Word) reloc_offset,
+				     ElfW(Addr) retaddr, void *regs,
+				     long int *framesizep)
+     ARCH_FIXUP_ATTRIBUTE;
+# endif
+
+#endif
+
+/* Mask identifying addresses reserved for the user program,
+   where the dynamic linker should not map anything.  */
+#define ELF_MACHINE_USER_ADDRESS_MASK	0xf8000000UL
+
+/* Initial entry point code for the dynamic linker.
+   The C function `_dl_start' is the real entry point;
+   its return value is the user program's entry point.  */
+
+#define RTLD_START asm ("\n\
+	.text\n\
+	.align 16\n\
+0:	movl (%esp), %ebx\n\
+	ret\n\
+	.align 16\n\
+.globl _start\n\
+.globl _dl_start_user\n\
+_start:\n\
+	# Note that _dl_start gets the parameter in %eax.\n\
+	movl %esp, %eax\n\
+	call _dl_start\n\
+_dl_start_user:\n\
+	# Save the user entry point address in %edi.\n\
+	movl %eax, %edi\n\
+	# Point %ebx at the GOT.\n\
+	call 0b\n\
+	addl $_GLOBAL_OFFSET_TABLE_, %ebx\n\
+	# See if we were run as a command with the executable file\n\
+	# name as an extra leading argument.\n\
+	movl _dl_skip_args@GOTOFF(%ebx), %eax\n\
+	# Pop the original argument count.\n\
+	popl %edx\n\
+	# Adjust the stack pointer to skip _dl_skip_args words.\n\
+	leal (%esp,%eax,4), %esp\n\
+	# Subtract _dl_skip_args from argc.\n\
+	subl %eax, %edx\n\
+	# Push argc back on the stack.\n\
+	push %edx\n\
+	# The special initializer gets called with the stack just\n\
+	# as the application's entry point will see it; it can\n\
+	# switch stacks if it moves these contents over.\n\
+" RTLD_START_SPECIAL_INIT "\n\
+	# Load the parameters again.\n\
+	# (eax, edx, ecx, *--esp) = (_dl_loaded, argc, argv, envp)\n\
+	movl _rtld_local@GOTOFF(%ebx), %eax\n\
+	leal 8(%esp,%edx,4), %esi\n\
+	leal 4(%esp), %ecx\n\
+	movl %esp, %ebp\n\
+	# Make sure _dl_init is run with 16 byte aligned stack.\n\
+	andl $-16, %esp\n\
+	pushl %eax\n\
+	pushl %eax\n\
+	pushl %ebp\n\
+	pushl %esi\n\
+	# Clear %ebp, so that even constructors have terminated backchain.\n\
+	xorl %ebp, %ebp\n\
+	# Call the function to run the initializers.\n\
+	call _dl_init\n\
+	# Pass our finalizer function to the user in %edx, as per ELF ABI.\n\
+	leal _dl_fini@GOTOFF(%ebx), %edx\n\
+	# Restore %esp _start expects.\n\
+	movl (%esp), %esp\n\
+	# Jump to the user's entry point.\n\
+	jmp *%edi\n\
+	.previous\n\
+");
+
+#ifndef RTLD_START_SPECIAL_INIT
+# define RTLD_START_SPECIAL_INIT /* nothing */
+#endif
+
+/* ELF_RTYPE_CLASS_PLT iff TYPE describes relocation of a PLT entry or
+   TLS variable, so undefined references should not be allowed to
+   define the value.
+   ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one
+   of the main executable's symbols, as for a COPY reloc.
+   ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA iff TYPE describes relocation
+   against protected data whose address may be external due to copy relocation.
+ */
+# define elf_machine_type_class(type) \
+  ((((type) == R_386_JMP_SLOT || (type) == R_386_TLS_DTPMOD32		      \
+     || (type) == R_386_TLS_DTPOFF32 || (type) == R_386_TLS_TPOFF32	      \
+     || (type) == R_386_TLS_TPOFF || (type) == R_386_TLS_DESC)		      \
+    * ELF_RTYPE_CLASS_PLT)						      \
+   | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY)			      \
+   | (((type) == R_386_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
+
+/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries.  */
+#define ELF_MACHINE_JMP_SLOT	R_386_JMP_SLOT
+
+/* The i386 never uses Elf32_Rela relocations for the dynamic linker.
+   Prelinked libraries may use Elf32_Rela though.  */
+#define ELF_MACHINE_PLT_REL 1
+
+/* We define an initialization function.  This is called very early in
+   _dl_sysdep_start.  */
+#define DL_PLATFORM_INIT dl_platform_init ()
+
+static inline void __attribute__ ((unused))
+dl_platform_init (void)
+{
+#if IS_IN (rtld)
+  /* Only done in rtld: in a static executable init_cpu_features has
+     already been called early from __libc_start_main.  */
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
+#else
+  if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
+    /* Avoid an empty string which would disturb us.  */
+    GLRO(dl_platform) = NULL;
+#endif
+}
+
+static inline Elf32_Addr
+elf_machine_fixup_plt (struct link_map *map, lookup_t t,
+		       const Elf32_Rel *reloc,
+		       Elf32_Addr *reloc_addr, Elf32_Addr value)
+{
+  return *reloc_addr = value;
+}
+
+/* Return the final value of a plt relocation.  */
+static inline Elf32_Addr
+elf_machine_plt_value (struct link_map *map, const Elf32_Rel *reloc,
+		       Elf32_Addr value)
+{
+  return value;
+}
+
+
+/* Names of the architecture-specific auditing callback functions.  */
+#define ARCH_LA_PLTENTER i86_gnu_pltenter
+#define ARCH_LA_PLTEXIT i86_gnu_pltexit
+
+#endif /* !dl_machine_h */
+
+/* The i386 never uses Elf32_Rela relocations for the dynamic linker.
+   Prelinked libraries may use Elf32_Rela though.  */
+#define ELF_MACHINE_NO_RELA defined RTLD_BOOTSTRAP
+#define ELF_MACHINE_NO_REL 0
+
+#ifdef RESOLVE_MAP
+
+/* Perform the relocation specified by RELOC and SYM (which is fully resolved).
+   MAP is the object containing the reloc.  */
+
+auto inline void
+__attribute ((always_inline))
+elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
+		 const Elf32_Sym *sym, const struct r_found_version *version,
+		 void *const reloc_addr_arg, int skip_ifunc)
+{
+  Elf32_Addr *const reloc_addr = reloc_addr_arg;
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+# if !defined RTLD_BOOTSTRAP || !defined HAVE_Z_COMBRELOC
+  if (__glibc_unlikely (r_type == R_386_RELATIVE))
+    {
+#  if !defined RTLD_BOOTSTRAP && !defined HAVE_Z_COMBRELOC
+      /* This is defined in rtld.c, but nowhere in the static libc.a;
+	 make the reference weak so static programs can still link.
+	 This declaration cannot be done when compiling rtld.c
+	 (i.e. #ifdef RTLD_BOOTSTRAP) because rtld.c contains the
+	 common defn for _dl_rtld_map, which is incompatible with a
+	 weak decl in the same file.  */
+#   ifndef SHARED
+      weak_extern (_dl_rtld_map);
+#   endif
+      if (map != &GL(dl_rtld_map)) /* Already done in rtld itself.  */
+#  endif
+	*reloc_addr += map->l_addr;
+    }
+#  ifndef RTLD_BOOTSTRAP
+  else if (__glibc_unlikely (r_type == R_386_NONE))
+    return;
+#  endif
+  else
+# endif	/* !RTLD_BOOTSTRAP or have no -z combreloc */
+    {
+# ifndef RTLD_BOOTSTRAP
+      const Elf32_Sym *const refsym = sym;
+# endif
+      struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type);
+      Elf32_Addr value = sym_map == NULL ? 0 : sym_map->l_addr + sym->st_value;
+
+      if (sym != NULL
+	  && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC,
+			       0)
+	  && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
+	  && __builtin_expect (!skip_ifunc, 1))
+	{
+# ifndef RTLD_BOOTSTRAP
+	  if (sym_map != map
+	      && sym_map->l_type != lt_executable
+	      && !sym_map->l_relocated)
+	    {
+	      const char *strtab
+		= (const char *) D_PTR (map, l_info[DT_STRTAB]);
+	      _dl_error_printf ("\
+%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
+				RTLD_PROGNAME, map->l_name,
+				sym_map->l_name,
+				strtab + refsym->st_name);
+	    }
+# endif
+	  value = ((Elf32_Addr (*) (void)) value) ();	/* Call the IFUNC symbol; its result is the value.  */
+	}
+
+      switch (r_type)
+	{
+# ifndef RTLD_BOOTSTRAP
+	case R_386_SIZE32:
+	  /* Set to symbol size plus addend.  */
+	  *reloc_addr += sym->st_size;
+	  break;
+# endif
+	case R_386_GLOB_DAT:
+	case R_386_JMP_SLOT:
+	  *reloc_addr = value;
+	  break;
+
+	case R_386_TLS_DTPMOD32:
+# ifdef RTLD_BOOTSTRAP
+	  /* During startup the dynamic linker is always the module
+	     with index 1.
+	     XXX If this relocation is necessary move before RESOLVE
+	     call.  */
+	  *reloc_addr = 1;
+# else
+	  /* Get the information from the link map returned by
+	     RESOLVE_MAP.  */
+	  if (sym_map != NULL)
+	    *reloc_addr = sym_map->l_tls_modid;
+# endif
+	  break;
+	case R_386_TLS_DTPOFF32:
+# ifndef RTLD_BOOTSTRAP
+	  /* During relocation all TLS symbols are defined and used.
+	     Therefore the offset is already correct.  */
+	  if (sym != NULL)
+	    *reloc_addr = sym->st_value;
+# endif
+	  break;
+	case R_386_TLS_DESC:
+	  {
+	    struct tlsdesc volatile *td =
+	      (struct tlsdesc volatile *)reloc_addr;
+
+# ifndef RTLD_BOOTSTRAP
+	    if (! sym)
+	      td->entry = _dl_tlsdesc_undefweak;
+	    else
+# endif
+	      {
+# ifndef RTLD_BOOTSTRAP
+#  ifndef SHARED
+		CHECK_STATIC_TLS (map, sym_map);
+#  else
+		if (!TRY_STATIC_TLS (map, sym_map))
+		  {
+		    td->arg = _dl_make_tlsdesc_dynamic
+		      (sym_map, sym->st_value + (ElfW(Word))td->arg);
+		    td->entry = _dl_tlsdesc_dynamic;
+		  }
+		else
+#  endif
+# endif
+		  {
+		    td->arg = (void*)(sym->st_value - sym_map->l_tls_offset
+				      + (ElfW(Word))td->arg);
+		    td->entry = _dl_tlsdesc_return;
+		  }
+	      }
+	    break;
+	  }
+	case R_386_TLS_TPOFF32:
+	  /* The offset is positive, backward from the thread pointer.  */
+#  ifdef RTLD_BOOTSTRAP
+	  *reloc_addr += map->l_tls_offset - sym->st_value;
+#  else
+	  /* We know the offset of object the symbol is contained in.
+	     It is a positive value which will be subtracted from the
+	     thread pointer.  To get the variable position in the TLS
+	     block we subtract the offset from that of the TLS block.  */
+	  if (sym != NULL)
+	    {
+	      CHECK_STATIC_TLS (map, sym_map);
+	      *reloc_addr += sym_map->l_tls_offset - sym->st_value;
+	    }
+# endif
+	  break;
+	case R_386_TLS_TPOFF:
+	  /* The offset is negative, forward from the thread pointer.  */
+# ifdef RTLD_BOOTSTRAP
+	  *reloc_addr += sym->st_value - map->l_tls_offset;
+# else
+	  /* We know the offset of object the symbol is contained in.
+	     It is a negative value which will be added to the
+	     thread pointer.  */
+	  if (sym != NULL)
+	    {
+	      CHECK_STATIC_TLS (map, sym_map);
+	      *reloc_addr += sym->st_value - sym_map->l_tls_offset;
+	    }
+# endif
+	  break;
+
+# ifndef RTLD_BOOTSTRAP
+	case R_386_32:
+	  *reloc_addr += value;
+	  break;
+	case R_386_PC32:
+	  *reloc_addr += (value - (Elf32_Addr) reloc_addr);
+	  break;
+	case R_386_COPY:
+	  if (sym == NULL)
+	    /* This can happen in trace mode if an object could not be
+	       found.  */
+	    break;
+	  if (__builtin_expect (sym->st_size > refsym->st_size, 0)
+	      || (__builtin_expect (sym->st_size < refsym->st_size, 0)
+		  && GLRO(dl_verbose)))
+	    {
+	      const char *strtab;
+
+	      strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+	      _dl_error_printf ("\
+%s: Symbol `%s' has different size in shared object, consider re-linking\n",
+				RTLD_PROGNAME, strtab + refsym->st_name);
+	    }
+	  memcpy (reloc_addr_arg, (void *) value,
+		  MIN (sym->st_size, refsym->st_size));
+	  break;
+	case R_386_IRELATIVE:
+	  value = map->l_addr + *reloc_addr;	/* Address of the function to call.  */
+	  value = ((Elf32_Addr (*) (void)) value) ();
+	  *reloc_addr = value;
+	  break;
+	default:
+	  _dl_reloc_bad_type (map, r_type, 0);
+	  break;
+# endif	/* !RTLD_BOOTSTRAP */
+	}
+    }
+}
+
+# ifndef RTLD_BOOTSTRAP
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
+		  const Elf32_Sym *sym, const struct r_found_version *version,
+		  void *const reloc_addr_arg, int skip_ifunc)
+{
+  Elf32_Addr *const reloc_addr = reloc_addr_arg;
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+  if (ELF32_R_TYPE (reloc->r_info) == R_386_RELATIVE)
+    *reloc_addr = map->l_addr + reloc->r_addend;
+  else if (r_type != R_386_NONE)
+    {
+#  ifndef RESOLVE_CONFLICT_FIND_MAP
+      const Elf32_Sym *const refsym = sym;
+#  endif
+      struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type);
+      Elf32_Addr value = sym == NULL ? 0 : sym_map->l_addr + sym->st_value;
+
+      if (sym != NULL
+	  && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
+	  && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+	  && __builtin_expect (!skip_ifunc, 1))
+	value = ((Elf32_Addr (*) (void)) value) ();
+
+      switch (ELF32_R_TYPE (reloc->r_info))
+	{
+	case R_386_SIZE32:
+	  /* Set to symbol size plus addend.  */
+	  value = sym->st_size;	/* Fall through.  */
+	case R_386_GLOB_DAT:
+	case R_386_JMP_SLOT:
+	case R_386_32:
+	  *reloc_addr = value + reloc->r_addend;
+	  break;
+#  ifndef RESOLVE_CONFLICT_FIND_MAP
+	  /* Not needed for dl-conflict.c.  */
+	case R_386_PC32:
+	  *reloc_addr = (value + reloc->r_addend - (Elf32_Addr) reloc_addr);
+	  break;
+
+	case R_386_TLS_DTPMOD32:
+	  /* Get the information from the link map returned by
+	     RESOLVE_MAP.  */
+	  if (sym_map != NULL)
+	    *reloc_addr = sym_map->l_tls_modid;
+	  break;
+	case R_386_TLS_DTPOFF32:
+	  /* During relocation all TLS symbols are defined and used.
+	     Therefore the offset is already correct.  */
+	  *reloc_addr = (sym == NULL ? 0 : sym->st_value) + reloc->r_addend;
+	  break;
+	case R_386_TLS_DESC:
+	  {
+	    struct tlsdesc volatile *td =
+	      (struct tlsdesc volatile *)reloc_addr;
+
+#   ifndef RTLD_BOOTSTRAP	/* Always true inside this function.  */
+	    if (!sym)
+	      {
+		td->arg = (void*)reloc->r_addend;
+		td->entry = _dl_tlsdesc_undefweak;
+	      }
+	    else
+#   endif
+	      {
+#   ifndef RTLD_BOOTSTRAP
+#    ifndef SHARED
+		CHECK_STATIC_TLS (map, sym_map);
+#    else
+		if (!TRY_STATIC_TLS (map, sym_map))
+		  {
+		    td->arg = _dl_make_tlsdesc_dynamic
+		      (sym_map, sym->st_value + reloc->r_addend);
+		    td->entry = _dl_tlsdesc_dynamic;
+		  }
+		else
+#    endif
+#   endif
+		  {
+		    td->arg = (void*)(sym->st_value - sym_map->l_tls_offset
+				      + reloc->r_addend);
+		    td->entry = _dl_tlsdesc_return;
+		  }
+	      }
+	  }
+	  break;
+	case R_386_TLS_TPOFF32:
+	  /* The offset is positive, backward from the thread pointer.  */
+	  /* We know the offset of object the symbol is contained in.
+	     It is a positive value which will be subtracted from the
+	     thread pointer.  To get the variable position in the TLS
+	     block we subtract the offset from that of the TLS block.  */
+	  if (sym != NULL)
+	    {
+	      CHECK_STATIC_TLS (map, sym_map);
+	      *reloc_addr = sym_map->l_tls_offset - sym->st_value
+			    + reloc->r_addend;
+	    }
+	  break;
+	case R_386_TLS_TPOFF:
+	  /* The offset is negative, forward from the thread pointer.  */
+	  /* We know the offset of object the symbol is contained in.
+	     It is a negative value which will be added to the
+	     thread pointer.  */
+	  if (sym != NULL)
+	    {
+	      CHECK_STATIC_TLS (map, sym_map);
+	      *reloc_addr = sym->st_value - sym_map->l_tls_offset
+			    + reloc->r_addend;
+	    }
+	  break;
+	case R_386_COPY:
+	  if (sym == NULL)
+	    /* This can happen in trace mode if an object could not be
+	       found.  */
+	    break;
+	  if (__builtin_expect (sym->st_size > refsym->st_size, 0)
+	      || (__builtin_expect (sym->st_size < refsym->st_size, 0)
+		  && GLRO(dl_verbose)))
+	    {
+	      const char *strtab;
+
+	      strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+	      _dl_error_printf ("\
+%s: Symbol `%s' has different size in shared object, consider re-linking\n",
+				RTLD_PROGNAME, strtab + refsym->st_name);
+	    }
+	  memcpy (reloc_addr_arg, (void *) value,
+		  MIN (sym->st_size, refsym->st_size));
+	  break;
+#  endif /* !RESOLVE_CONFLICT_FIND_MAP */
+	case R_386_IRELATIVE:
+	  value = map->l_addr + reloc->r_addend;	/* Address of the function to call.  */
+	  value = ((Elf32_Addr (*) (void)) value) ();
+	  *reloc_addr = value;
+	  break;
+	default:
+	  /* NOTE(review): this function is only built without RTLD_BOOTSTRAP,
+	     so every type not handled above reaches _dl_reloc_bad_type.  */
+	  _dl_reloc_bad_type (map, r_type, 0);
+	  break;
+	}
+    }
+}
+# endif	/* !RTLD_BOOTSTRAP */
+
+auto inline void
+__attribute ((always_inline))
+elf_machine_rel_relative (Elf32_Addr l_addr, const Elf32_Rel *reloc,
+			  void *const reloc_addr_arg)
+{
+  Elf32_Addr *const reloc_addr = reloc_addr_arg;
+  assert (ELF32_R_TYPE (reloc->r_info) == R_386_RELATIVE);
+  *reloc_addr += l_addr;	/* REL form: adjust the word already stored in place.  */
+}
+
+# ifndef RTLD_BOOTSTRAP
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
+			   void *const reloc_addr_arg)
+{
+  Elf32_Addr *const reloc_addr = reloc_addr_arg;
+  *reloc_addr = l_addr + reloc->r_addend;	/* RELA form: the addend comes from the reloc entry.  */
+}
+# endif	/* !RTLD_BOOTSTRAP */
+
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_lazy_rel (struct link_map *map,
+		      Elf32_Addr l_addr, const Elf32_Rel *reloc,
+		      int skip_ifunc)
+{
+  Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+  /* Check for unexpected PLT reloc type.  */
+  if (__glibc_likely (r_type == R_386_JMP_SLOT))
+    {
+      if (__builtin_expect (map->l_mach.plt, 0) == 0)
+	*reloc_addr += l_addr;	/* Adjust the stored slot value by L_ADDR.  */
+      else
+	*reloc_addr = (map->l_mach.plt
+		       + (((Elf32_Addr) reloc_addr) - map->l_mach.gotplt) * 4);
+    }
+  else if (__glibc_likely (r_type == R_386_TLS_DESC))
+    {
+      struct tlsdesc volatile * __attribute__((__unused__)) td =
+	(struct tlsdesc volatile *)reloc_addr;
+
+      /* Handle relocations that reference the local *ABS* in a simple
+	 way, so as to preserve a potential addend.  */
+      if (ELF32_R_SYM (reloc->r_info) == 0)
+	td->entry = _dl_tlsdesc_resolve_abs_plus_addend;
+      /* Given a known-zero addend, we can store a pointer to the
+	 reloc in the arg position.  */
+      else if (td->arg == 0)
+	{
+	  td->arg = (void*)reloc;	/* Picked up later by the lazy resolver.  */
+	  td->entry = _dl_tlsdesc_resolve_rel;
+	}
+      else
+	{
+	  /* We could handle non-*ABS* relocations with non-zero addends
+	     by allocating dynamically an arg to hold a pointer to the
+	     reloc, but that sounds pointless.  */
+	  const Elf32_Rel *const r = reloc;
+	  /* The code below was borrowed from elf_dynamic_do_rel().  */
+	  const ElfW(Sym) *const symtab =
+	    (const void *) D_PTR (map, l_info[DT_SYMTAB]);
+
+# ifdef RTLD_BOOTSTRAP
+	  /* The dynamic linker always uses versioning.  */
+	  assert (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL);
+# else
+	  if (map->l_info[VERSYMIDX (DT_VERSYM)])
+# endif
+	    {
+	      const ElfW(Half) *const version =
+		(const void *) D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
+	      ElfW(Half) ndx = version[ELFW(R_SYM) (r->r_info)] & 0x7fff;
+	      elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)],
+			       &map->l_versions[ndx],
+			       (void *) (l_addr + r->r_offset), skip_ifunc);
+	    }
+# ifndef RTLD_BOOTSTRAP
+	  else
+	    elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], NULL,
+			     (void *) (l_addr + r->r_offset), skip_ifunc);
+# endif
+	}
+    }
+  else if (__glibc_unlikely (r_type == R_386_IRELATIVE))
+    {
+      Elf32_Addr value = map->l_addr + *reloc_addr;
+      if (__glibc_likely (!skip_ifunc))
+	value = ((Elf32_Addr (*) (void)) value) ();
+      *reloc_addr = value;	/* With SKIP_IFUNC set this is the uncalled address.  */
+    }
+  else
+    _dl_reloc_bad_type (map, r_type, 1);
+}
+
+# ifndef RTLD_BOOTSTRAP
+
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_lazy_rela (struct link_map *map,
+		       Elf32_Addr l_addr, const Elf32_Rela *reloc,
+		       int skip_ifunc)
+{
+  Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+  if (__glibc_likely (r_type == R_386_JMP_SLOT))
+    ;	/* Nothing is written for a JMP_SLOT in the RELA case.  */
+  else if (__glibc_likely (r_type == R_386_TLS_DESC))
+    {
+      struct tlsdesc volatile * __attribute__((__unused__)) td =
+	(struct tlsdesc volatile *)reloc_addr;
+
+      td->arg = (void*)reloc;	/* Picked up later by the lazy resolver.  */
+      td->entry = _dl_tlsdesc_resolve_rela;
+    }
+  else if (__glibc_unlikely (r_type == R_386_IRELATIVE))
+    {
+      Elf32_Addr value = map->l_addr + reloc->r_addend;
+      if (__glibc_likely (!skip_ifunc))
+	value = ((Elf32_Addr (*) (void)) value) ();
+      *reloc_addr = value;	/* With SKIP_IFUNC set this is the uncalled address.  */
+    }
+  else
+    _dl_reloc_bad_type (map, r_type, 1);
+}
+
+# endif	/* !RTLD_BOOTSTRAP */
+
+#endif /* RESOLVE_MAP */
diff --git a/REORG.TODO/sysdeps/i386/dl-procinfo.c b/REORG.TODO/sysdeps/i386/dl-procinfo.c
new file mode 100644
index 0000000000..7237f778b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-procinfo.c
@@ -0,0 +1,65 @@
+/* Data for i386 version of processor capability information.
+   Copyright (C) 2001-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2001.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* If anything should be added here check whether the size of each string
+   is still ok with the given array size.
+
+   All the #ifdefs in the definitions are quite irritating but
+   necessary if we want to avoid duplicating the information.  There
+   are three different modes:
+
+   - PROCINFO_DECL is defined.  This means we are only interested in
+     declarations.
+
+   - PROCINFO_DECL is not defined:
+
+     + if SHARED is defined the file is included in an array
+       initializer.  The .element = { ... } syntax is needed.
+
+     + if SHARED is not defined a normal array initialization is
+       needed.
+  */
+
+#ifndef PROCINFO_CLASS
+# define PROCINFO_CLASS
+#endif
+
+#include <sysdeps/x86/dl-procinfo.c>
+
+#if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_cap_flags	/* Used inside an enclosing struct initializer.  */
+#else
+PROCINFO_CLASS const char _dl_x86_cap_flags[32][8]	/* 32 names, each at most 7 chars plus NUL.  */
+#endif
+#ifndef PROCINFO_DECL
+= {
+    "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
+    "cx8", "apic", "10", "sep", "mtrr", "pge", "mca", "cmov",
+    "pat", "pse36", "pn", "clflush", "20", "dts", "acpi", "mmx",
+    "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe"
+  }
+#endif
+#if !defined SHARED || defined PROCINFO_DECL
+;
+#else
+,
+#endif
+
+#undef PROCINFO_DECL
+#undef PROCINFO_CLASS
diff --git a/REORG.TODO/sysdeps/i386/dl-tls.h b/REORG.TODO/sysdeps/i386/dl-tls.h
new file mode 100644
index 0000000000..525ebab992
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tls.h
@@ -0,0 +1,61 @@
+/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* Type used for the representation of TLS information in the GOT.  */
+typedef struct dl_tls_index
+{
+  unsigned long int ti_module;	/* Module ID; used as the index into the dtv.  */
+  unsigned long int ti_offset;	/* Offset added to that module's TLS block pointer.  */
+} tls_index;
+
+
+#ifdef SHARED
+/* This is the prototype for the GNU version.  */
+extern void *___tls_get_addr (tls_index *ti)
+     __attribute__ ((__regparm__ (1)));
+extern void *___tls_get_addr_internal (tls_index *ti)
+     __attribute__ ((__regparm__ (1))) attribute_hidden;
+
+# if IS_IN (rtld)
+/* The special thing about the x86 TLS ABI is that we have two
+   variants of the __tls_get_addr function with different calling
+   conventions.  The GNU version, which we are mostly concerned here,
+   takes the parameter in a register.  The name is changed by adding
+   an additional underscore at the beginning.  The Sun version uses
+   the normal calling convention.  */
+void *
+__tls_get_addr (tls_index *ti)
+{
+  return ___tls_get_addr_internal (ti);	/* Forward to the regparm variant.  */
+}
+
+
+/* Prepare using the definition of __tls_get_addr in the generic
+   version of this file.  */
+# define __tls_get_addr __attribute__ ((__regparm__ (1))) ___tls_get_addr
+strong_alias (___tls_get_addr, ___tls_get_addr_internal)
+rtld_hidden_proto (___tls_get_addr)
+rtld_hidden_def (___tls_get_addr)
+#else	/* !IS_IN (rtld) */
+
+/* Users should get the better interface.  */
+# define __tls_get_addr ___tls_get_addr
+
+# endif	/* IS_IN (rtld) */
+#endif	/* SHARED */
diff --git a/REORG.TODO/sysdeps/i386/dl-tlsdesc.S b/REORG.TODO/sysdeps/i386/dl-tlsdesc.S
new file mode 100644
index 0000000000..8befdc2b39
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tlsdesc.S
@@ -0,0 +1,285 @@
+/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <tls.h>
+#include "tlsdesc.h"
+
+	.text
+
+     /* This function is used to compute the TP offset for symbols in
+	Static TLS, i.e., whose TP offset is the same for all
+	threads.
+
+	The incoming %eax points to the TLS descriptor, such that
+	0(%eax) points to _dl_tlsdesc_return itself, and 4(%eax) holds
+	the TP offset of the symbol corresponding to the object
+	denoted by the argument.  */
+
+	.hidden _dl_tlsdesc_return
+	.global	_dl_tlsdesc_return
+	.type	_dl_tlsdesc_return,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_return:
+	movl	4(%eax), %eax		/* Load the TP offset stored in the descriptor.  */
+	ret
+	cfi_endproc
+	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
+
+     /* This function is used for undefined weak TLS symbols, for
+	which the base address (i.e., disregarding any addend) should
+	resolve to NULL.
+
+	%eax points to the TLS descriptor, such that 0(%eax) points to
+	_dl_tlsdesc_undefweak itself, and 4(%eax) holds the addend.
+	We return the addend minus the TP, such that, when the caller
+	adds TP, it gets the addend back.  If that's zero, as usual,
+	that's most likely a NULL pointer.  */
+
+	.hidden _dl_tlsdesc_undefweak
+	.global	_dl_tlsdesc_undefweak
+	.type	_dl_tlsdesc_undefweak,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_undefweak:
+	movl	4(%eax), %eax		/* Load the addend.  */
+	subl	%gs:0, %eax		/* Subtract the TP so the caller's add cancels it.  */
+	ret
+	cfi_endproc
+	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
+
+#ifdef SHARED
+	.hidden _dl_tlsdesc_dynamic
+	.global	_dl_tlsdesc_dynamic
+	.type	_dl_tlsdesc_dynamic,@function
+
+     /* This function is used for symbols that need dynamic TLS.
+
+	%eax points to the TLS descriptor, such that 0(%eax) points to
+	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
+	tlsdesc_dynamic_arg object.  It must return in %eax the offset
+	between the thread pointer and the object denoted by the
+	argument, without clobbering any registers.
+
+	The assembly code that follows is a rendition of the following
+	C code, hand-optimized a little bit.
+
+ptrdiff_t
+__attribute__ ((__regparm__ (1)))
+_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
+{
+  struct tlsdesc_dynamic_arg *td = tdp->arg;
+  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+  if (__builtin_expect (td->gen_count <= dtv[0].counter
+			&& (dtv[td->tlsinfo.ti_module].pointer.val
+			    != TLS_DTV_UNALLOCATED),
+			1))
+    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+      - __thread_pointer;
+
+  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+}
+*/
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_dynamic:
+	/* Like all TLS resolvers, preserve call-clobbered registers.
+	   We need two scratch regs anyway.  */
+	subl	$28, %esp
+	cfi_adjust_cfa_offset (28)
+	movl	%ecx, 20(%esp)
+	movl	%edx, 24(%esp)
+	movl	TLSDESC_ARG(%eax), %eax		/* td = tdp->arg (see the C code above).  */
+	movl	%gs:DTV_OFFSET, %edx		/* dtv */
+	movl	TLSDESC_GEN_COUNT(%eax), %ecx	/* td->gen_count */
+	cmpl	(%edx), %ecx			/* Compare with dtv[0].counter.  */
+	ja	.Lslow
+	movl	TLSDESC_MODID(%eax), %ecx	/* td->tlsinfo.ti_module */
+	movl	(%edx,%ecx,8), %edx		/* dtv[ti_module].pointer.val */
+	cmpl	$-1, %edx			/* Presumably TLS_DTV_UNALLOCATED -- confirm.  */
+	je	.Lslow
+	movl	TLSDESC_MODOFF(%eax), %eax	/* td->tlsinfo.ti_offset */
+	addl	%edx, %eax
+.Lret:
+	movl	20(%esp), %ecx
+	subl	%gs:0, %eax			/* Make the result relative to the thread pointer.  */
+	movl	24(%esp), %edx
+	addl	$28, %esp
+	cfi_adjust_cfa_offset (-28)
+	ret
+	.p2align 4,,7
+.Lslow:
+	cfi_adjust_cfa_offset (28)		/* The 28-byte frame is still allocated on this path.  */
+	call	HIDDEN_JUMPTARGET (___tls_get_addr)
+	jmp	.Lret
+	cfi_endproc
+	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+#endif /* SHARED */
+
+     /* This function is a wrapper for a lazy resolver for TLS_DESC
+	REL relocations that reference the *ABS* segment in their own
+	link maps.  %ebx points to the caller's GOT.  %eax points to a
+	TLS descriptor, such that 0(%eax) holds the address of the
+	resolver wrapper itself (unless some other thread beat us to
+	it) and 4(%eax) holds the addend in the relocation.
+
+	When the actual resolver returns, it will have adjusted the
+	TLS descriptor such that we can tail-call it for it to return
+	the TP offset of the symbol.  */
+
+	.hidden _dl_tlsdesc_resolve_abs_plus_addend
+	.global	_dl_tlsdesc_resolve_abs_plus_addend
+	.type	_dl_tlsdesc_resolve_abs_plus_addend,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_resolve_abs_plus_addend:
+0:
+	pushl	%eax			/* Save %eax, %ecx and %edx around the call.  */
+	cfi_adjust_cfa_offset (4)
+	pushl	%ecx
+	cfi_adjust_cfa_offset (4)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+	movl	$1f - 0b, %ecx		/* Byte distance from label 0 to label 1.  */
+	movl	4(%ebx), %edx		/* Second GOT word; presumably the link map -- confirm.  */
+	call	_dl_tlsdesc_resolve_abs_plus_addend_fixup
+1:
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	popl	%ecx
+	cfi_adjust_cfa_offset (-4)
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	jmp	*(%eax)			/* Tail-call the entry now stored in the descriptor.  */
+	cfi_endproc
+	.size	_dl_tlsdesc_resolve_abs_plus_addend, .-_dl_tlsdesc_resolve_abs_plus_addend
+
+     /* This function is a wrapper for a lazy resolver for TLS_DESC
+	REL relocations that had zero addends.  %ebx points to the
+	caller's GOT.  %eax points to a TLS descriptor, such that
+	0(%eax) holds the address of the resolver wrapper itself
+	(unless some other thread beat us to it) and 4(%eax) holds a
+	pointer to the relocation.
+
+	When the actual resolver returns, it will have adjusted the
+	TLS descriptor such that we can tail-call it for it to return
+	the TP offset of the symbol.  */
+
+	.hidden _dl_tlsdesc_resolve_rel
+	.global	_dl_tlsdesc_resolve_rel
+	.type	_dl_tlsdesc_resolve_rel,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_resolve_rel:
+0:
+	pushl	%eax			/* Save %eax, %ecx and %edx around the call.  */
+	cfi_adjust_cfa_offset (4)
+	pushl	%ecx
+	cfi_adjust_cfa_offset (4)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+	movl	$1f - 0b, %ecx		/* Byte distance from label 0 to label 1.  */
+	movl	4(%ebx), %edx		/* Second GOT word; presumably the link map -- confirm.  */
+	call	_dl_tlsdesc_resolve_rel_fixup
+1:
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	popl	%ecx
+	cfi_adjust_cfa_offset (-4)
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	jmp	*(%eax)			/* Tail-call the entry now stored in the descriptor.  */
+	cfi_endproc
+	.size	_dl_tlsdesc_resolve_rel, .-_dl_tlsdesc_resolve_rel
+
+     /* This function is a wrapper for a lazy resolver for TLS_DESC
+	RELA relocations.  %ebx points to the caller's GOT.  %eax
+	points to a TLS descriptor, such that 0(%eax) holds the
+	address of the resolver wrapper itself (unless some other
+	thread beat us to it) and 4(%eax) holds a pointer to the
+	relocation.
+
+	When the actual resolver returns, it will have adjusted the
+	TLS descriptor such that we can tail-call it for it to return
+	the TP offset of the symbol.  */
+
+	.hidden _dl_tlsdesc_resolve_rela
+	.global	_dl_tlsdesc_resolve_rela
+	.type	_dl_tlsdesc_resolve_rela,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_resolve_rela:
+0:
+	pushl	%eax			/* Save %eax, %ecx and %edx around the call.  */
+	cfi_adjust_cfa_offset (4)
+	pushl	%ecx
+	cfi_adjust_cfa_offset (4)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+	movl	$1f - 0b, %ecx		/* Byte distance from label 0 to label 1.  */
+	movl	4(%ebx), %edx		/* Second GOT word; presumably the link map -- confirm.  */
+	call	_dl_tlsdesc_resolve_rela_fixup
+1:
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	popl	%ecx
+	cfi_adjust_cfa_offset (-4)
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	jmp	*(%eax)			/* Tail-call the entry now stored in the descriptor.  */
+	cfi_endproc
+	.size	_dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela
+
+     /* This function is a placeholder for lazy resolving of TLS
+	relocations.  Once some thread starts resolving a TLS
+	relocation, it sets up the TLS descriptor to use this
+	resolver, such that other threads that would attempt to
+	resolve it concurrently may skip the call to the original lazy
+	resolver and go straight to a condition wait.
+
+	When the actual resolver returns, it will have adjusted the
+	TLS descriptor such that we can tail-call it for it to return
+	the TP offset of the symbol.  */
+
+	.hidden _dl_tlsdesc_resolve_hold
+	.global	_dl_tlsdesc_resolve_hold
+	.type	_dl_tlsdesc_resolve_hold,@function
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_resolve_hold:
+0:
+	pushl	%eax			/* Save %eax, %ecx and %edx around the call.  */
+	cfi_adjust_cfa_offset (4)
+	pushl	%ecx
+	cfi_adjust_cfa_offset (4)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+	movl	$1f - 0b, %ecx		/* Byte distance from label 0 to label 1.  */
+	movl	4(%ebx), %edx		/* Second GOT word; presumably the link map -- confirm.  */
+	call	_dl_tlsdesc_resolve_hold_fixup
+1:
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	popl	%ecx
+	cfi_adjust_cfa_offset (-4)
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	jmp	*(%eax)			/* Tail-call the entry now stored in the descriptor.  */
+	cfi_endproc
+	.size	_dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold
diff --git a/REORG.TODO/sysdeps/i386/dl-tlsdesc.h b/REORG.TODO/sysdeps/i386/dl-tlsdesc.h
new file mode 100644
index 0000000000..242bebfc8e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tlsdesc.h
@@ -0,0 +1,61 @@
+/* Thread-local storage descriptor handling in the ELF dynamic linker.
+   i386 version.
+   Copyright (C) 2005-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _I386_DL_TLSDESC_H
+# define _I386_DL_TLSDESC_H 1
+
+/* Type used to represent a TLS descriptor in the GOT.  */
+struct tlsdesc
+{
+  ptrdiff_t __attribute__ ((regparm (1))) (*entry) (struct tlsdesc *);
+  void *arg;	/* Interpretation depends on which function ENTRY points to.  */
+};
+
+typedef struct dl_tls_index
+{
+  unsigned long int ti_module;	/* Module ID; used as the index into the dtv.  */
+  unsigned long int ti_offset;	/* Offset added to that module's TLS block pointer.  */
+} tls_index;
+
+/* Type used as the argument in a TLS descriptor for a symbol that
+   needs dynamic TLS offsets.  */
+struct tlsdesc_dynamic_arg
+{
+  tls_index tlsinfo;	/* Handed to ___tls_get_addr on the slow path.  */
+  size_t gen_count;	/* Compared against dtv[0].counter by _dl_tlsdesc_dynamic.  */
+};
+
+extern ptrdiff_t attribute_hidden __attribute__ ((regparm (1)))
+  _dl_tlsdesc_return (struct tlsdesc *),	/* Symbols in static TLS.  */
+  _dl_tlsdesc_undefweak (struct tlsdesc *),	/* Undefined weak TLS symbols.  */
+  _dl_tlsdesc_resolve_abs_plus_addend (struct tlsdesc *),	/* Lazy, REL against *ABS*.  */
+  _dl_tlsdesc_resolve_rel (struct tlsdesc *),	/* Lazy, REL with zero addend.  */
+  _dl_tlsdesc_resolve_rela (struct tlsdesc *),	/* Lazy, RELA.  */
+  _dl_tlsdesc_resolve_hold (struct tlsdesc *);	/* Placeholder while another thread resolves.  */
+
+# ifdef SHARED
+extern void *_dl_make_tlsdesc_dynamic (struct link_map *map,
+				       size_t ti_offset)
+  internal_function attribute_hidden;
+
+extern ptrdiff_t attribute_hidden __attribute__ ((regparm (1)))
+  _dl_tlsdesc_dynamic (struct tlsdesc *);	/* Symbols that need dynamic TLS.  */
+# endif	/* SHARED */
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/dl-trampoline.S b/REORG.TODO/sysdeps/i386/dl-trampoline.S
new file mode 100644
index 0000000000..6e7f3aef92
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-trampoline.S
@@ -0,0 +1,215 @@
+/* PLT trampolines.  i386 version.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <link-defines.h>
+
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2	/* Hand-encoded bnd prefix.  */
+#endif
+
+	.text
+	.globl _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	cfi_startproc
+	.align 16
+_dl_runtime_resolve:
+	cfi_adjust_cfa_offset (8)
+	pushl %eax		# Preserve registers otherwise clobbered.
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %edx
+	cfi_adjust_cfa_offset (4)
+	movl 16(%esp), %edx	# Copy args pushed by PLT in register.  Note
+	movl 12(%esp), %eax	# that `fixup' takes its parameters in regs.
+	call _dl_fixup		# Call resolver.
+	popl %edx		# Get register content back.
+	cfi_adjust_cfa_offset (-4)
+	movl (%esp), %ecx	# Reload %ecx; its slot is reused below.
+	movl %eax, (%esp)	# Store the function address.
+	movl 4(%esp), %eax	# Reload %eax.
+	ret $12			# Jump to function; drop %eax + 2 PLT words.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+	.globl _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	cfi_startproc
+	.align 16
+_dl_runtime_profile:
+	cfi_adjust_cfa_offset (8)
+	pushl %esp		# Stack pointer at entry
+	cfi_adjust_cfa_offset (4)
+	addl $8, (%esp)		# Account for the pushed PLT data
+	pushl %ebp
+	cfi_adjust_cfa_offset (4)
+	pushl %eax		# Preserve registers otherwise clobbered.
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %edx
+	cfi_adjust_cfa_offset (4)
+	movl %esp, %ecx		# Start of the saved registers
+	subl $8, %esp
+	cfi_adjust_cfa_offset (8)
+	movl $-1, 4(%esp)	# Frame size slot, preset to -1
+	leal 4(%esp), %edx
+	movl %edx, (%esp)	# Pass the address of that slot
+	pushl %ecx		# Address of the register structure
+	cfi_adjust_cfa_offset (4)
+	movl 40(%esp), %ecx	# Load return address
+	movl 36(%esp), %edx	# Copy args pushed by PLT in register.  Note
+	movl 32(%esp), %eax	# that `fixup' takes its parameters in regs.
+	call _dl_profile_fixup	# Call resolver.
+	cfi_adjust_cfa_offset (-8)
+	movl (%esp), %edx	# Frame size slot after the call
+	testl %edx, %edx
+	jns 1f			# Non-negative: take the pltexit path
+	popl %edx		# Discard the frame size slot
+	cfi_adjust_cfa_offset (-4)
+	popl %edx		# Get register content back.
+	cfi_adjust_cfa_offset (-4)
+	movl (%esp), %ecx	# Reload %ecx; its slot is reused below.
+	movl %eax, (%esp)	# Store the function address.
+	movl 4(%esp), %eax	# Reload %eax.
+	ret $20			# Jump to function address.
+
+	/*
+	    +32     return address
+	    +28     PLT1
+	    +24     PLT2
+	    +20     %esp
+	    +16     %ebp
+	    +12     %eax
+	    +8      %ecx
+	    +4      %edx
+	   %esp     free
+	*/
+	cfi_adjust_cfa_offset (8)
+1:	movl %ebx, (%esp)	# Save %ebx in the free slot
+	cfi_rel_offset (ebx, 0)
+	movl %edx, %ebx		# This is the frame buffer size
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (esi, 0)
+	leal 44(%esp), %esi	# Copy source: just above the return address
+	movl %ebx, %ecx		# Keep the unrounded size for the copy
+	orl $4, %ebx		# Increase frame size if necessary to align
+				# stack for the function call
+	andl $~3, %ebx
+	movl %esp, %edi
+	subl %ebx, %edi		# Copy destination: the new stack top
+	movl %esp, %ebx		# %ebx now addresses the save area
+	cfi_def_cfa_register (ebx)
+	movl %edi, %esp
+	shrl $2, %ecx		# Bytes -> dwords
+	rep
+	movsl			# Copy the stack frame
+	movl (%ebx), %esi	# Restore %esi
+	cfi_restore (esi)
+	movl 4(%ebx), %edi	# Restore %edi
+	cfi_restore (edi)
+	/*
+	   %ebx+40  return address
+	   %ebx+36  PLT1
+	   %ebx+32  PLT2
+	   %ebx+28  %esp
+	   %ebx+24  %ebp
+	   %ebx+20  %eax
+	   %ebx+16  %ecx
+	   %ebx+12  %edx
+	   %ebx+8   %ebx
+	   %ebx+4   free
+	   %ebx     free
+	   %esp     copied stack frame
+	*/
+	movl %eax, (%ebx)	# Function address into a free slot
+	movl 12(%ebx), %edx	# Reload the saved %edx, %ecx and %eax
+	movl 16(%ebx), %ecx
+	movl 20(%ebx), %eax
+	call *(%ebx)		# Call the resolved function
+	movl %ebx, %esp		# Discard the copied frame
+	cfi_def_cfa_register (esp)
+	movl 8(%esp), %ebx	# Restore %ebx
+	cfi_restore (ebx)
+	/*
+	    +40     return address
+	    +36     PLT1
+	    +32     PLT2
+	    +28     %esp
+	    +24     %ebp
+	    +20     %eax
+	    +16     %ecx
+	    +12     %edx
+	    +8      free
+	    +4      free
+	   %esp     free
+	*/
+#if LONG_DOUBLE_SIZE != 12
+# error "long double size must be 12 bytes"
+#endif
+	# Allocate space for La_i86_retval and subtract 12 free bytes.
+	subl $(LRV_SIZE - 12), %esp
+	cfi_adjust_cfa_offset (LRV_SIZE - 12)
+	movl %eax, LRV_EAX_OFFSET(%esp)
+	movl %edx, LRV_EDX_OFFSET(%esp)
+	fstpt LRV_ST0_OFFSET(%esp)	# Save and pop %st(0)
+	fstpt LRV_ST1_OFFSET(%esp)	# Save and pop the former %st(1)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, LRV_BND0_OFFSET(%esp)
+	bndmov %bnd1, LRV_BND1_OFFSET(%esp)
+#else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,LRV_BND0_OFFSET	# bndmov store, %bnd0
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,LRV_BND1_OFFSET	# bndmov store, %bnd1
+#endif
+	pushl %esp		# Address of the La_i86_retval area
+	cfi_adjust_cfa_offset (4)
+	# Address of La_i86_regs area.
+	leal (LRV_SIZE + 4)(%esp), %ecx
+	# PLT2
+	movl (LRV_SIZE + 4 + LR_SIZE)(%esp), %eax
+	# PLT1
+	movl (LRV_SIZE + 4 + LR_SIZE + 4)(%esp), %edx
+	call _dl_call_pltexit
+	movl LRV_EAX_OFFSET(%esp), %eax
+	movl LRV_EDX_OFFSET(%esp), %edx
+	fldt LRV_ST1_OFFSET(%esp)	# Reload in reverse order so that
+	fldt LRV_ST0_OFFSET(%esp)	# the saved %st(0) ends up on top
+#ifdef HAVE_MPX_SUPPORT
+	bndmov LRV_BND0_OFFSET(%esp), %bnd0
+	bndmov LRV_BND1_OFFSET(%esp), %bnd1
+#else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,LRV_BND0_OFFSET	# bndmov load, %bnd0
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,LRV_BND1_OFFSET	# bndmov load, %bnd1
+#endif
+	# Restore stack before return.
+	addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
+	cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
+	PRESERVE_BND_REGS_PREFIX
+	ret
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
+#endif
diff --git a/REORG.TODO/sysdeps/i386/ffs.c b/REORG.TODO/sysdeps/i386/ffs.c
new file mode 100644
index 0000000000..c229c8166e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ffs.c
@@ -0,0 +1,50 @@
+/* ffs -- find first set bit in a word, counted from least significant end.
+   For Intel 80x86, x>=3.
+   This file is part of the GNU C Library.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef	ffs
+
+#ifdef	__GNUC__
+
+int
+__ffs (int x)
+{
+  int cnt;	/* Result: one-based bit index, or 0 if X is zero.  */
+  int tmp;	/* Zero-based bit index produced by bsfl.  */
+
+  asm ("xorl %0,%0\n"		/* Set CNT to zero.  */
+       "bsfl %2,%1\n"		/* Index of lowest set bit of X into %1.  */
+       "jz 1f\n"		/* X was zero (ZF set): keep CNT == 0.  */
+       "leal 1(%1),%0\n"	/* Return bsfl-result plus one on %0.  */
+       "1:" : "=&a" (cnt), "=r" (tmp) : "rm" (x));
+
+  return cnt;
+}
+weak_alias (__ffs, ffs)
+libc_hidden_def (__ffs)
+libc_hidden_builtin_def (ffs)
+#undef ffsl
+weak_alias (__ffs, ffsl)
+
+#else
+#include <string/ffs.c>
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/Implies b/REORG.TODO/sysdeps/i386/fpu/Implies
new file mode 100644
index 0000000000..2b745a34fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/Implies
@@ -0,0 +1 @@
+x86/fpu
diff --git a/REORG.TODO/sysdeps/i386/fpu/Versions b/REORG.TODO/sysdeps/i386/fpu/Versions
new file mode 100644
index 0000000000..a2eec371f1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/Versions
@@ -0,0 +1,6 @@
+libm {
+  GLIBC_2.2 {
+    # functions used in inline functions or macros
+    __expl; __expm1l;
+  }
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/doasin.c b/REORG.TODO/sysdeps/i386/fpu/doasin.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/doasin.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acos.S b/REORG.TODO/sysdeps/i386/fpu/e_acos.S
new file mode 100644
index 0000000000..586c7fc406
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acos.S
@@ -0,0 +1,25 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: e_acos.S,v 1.4 1995/05/08 23:44:37 jtc Exp $")
+
+/* acos = atan (sqrt((1-x) (1+x)) / x) */
+ENTRY(__ieee754_acos)
+	fldl	4(%esp)			/* x */
+	fld	%st			/* x : x */
+	fld1				/* 1 : x : x */
+	fsubp				/* 1 - x : x */
+	fld1				/* 1 : 1 - x : x */
+	fadd	%st(2)			/* 1 + x : 1 - x : x */
+	fmulp				/* 1 - x^2 : x */
+	fsqrt				/* sqrt (1 - x^2) : x */
+	fabs				/* |sqrt (1 - x^2)| : x */
+	fxch	%st(1)			/* x : sqrt (1 - x^2) */
+	fpatan				/* atan (sqrt(1 - x^2) / x) */
+	ret				/* result is in %st(0) */
+END (__ieee754_acos)
+strong_alias (__ieee754_acos, __acos_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosf.S b/REORG.TODO/sysdeps/i386/fpu/e_acosf.S
new file mode 100644
index 0000000000..54930af8b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosf.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+/* acos = atan (sqrt(1 - x^2) / x) */
+ENTRY(__ieee754_acosf)
+	flds	4(%esp)			/* x */
+	fld	%st			/* x : x */
+	fmul	%st(0)			/* x^2 : x */
+	fld1				/* 1 : x^2 : x */
+	fsubp				/* 1 - x^2 : x */
+	fsqrt				/* sqrt (1 - x^2) : x */
+	fabs				/* |sqrt (1 - x^2)| : x */
+	fxch	%st(1)			/* x : sqrt (1 - x^2) */
+	fpatan				/* atan (sqrt (1 - x^2) / x) */
+	ret				/* result is in %st(0) */
+END (__ieee754_acosf)
+strong_alias (__ieee754_acosf, __acosf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosh.S b/REORG.TODO/sysdeps/i386/fpu/e_acosh.S
new file mode 100644
index 0000000000..9555ef8078
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosh.S
@@ -0,0 +1,101 @@
+/* ix87 specific implementation of arccosh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29		// fyl2xp1 is only used at or below this value
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_acosh)
+	movl	8(%esp), %ecx		// high word of x
+	cmpl	$0x3ff00000, %ecx
+	jl	5f			// < 1 => invalid
+	fldln2				// log(2)
+	fldl	4(%esp)			// x : log(2)
+	cmpl	$0x41b00000, %ecx
+	ja	3f			// x > 2^28
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x40000000, %ecx
+	ja	4f			// x > 2
+
+	// 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	fsubl	MO(one)			// x-1 : log(2)
+	fabs				// acosh(1) is +0 in all rounding modes
+	fld	%st			// x-1 : x-1 : log(2)
+	fmul	%st(1)			// (x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// x-1+(x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// 2*(x-1)+(x-1)^2 : x-1 : log(2)
+	fsqrt				// sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+	faddp				// x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fcoml	MO(limit)		// too large for fyl2xp1?
+	fnstsw
+	sahf
+	ja	2f			// yes: add 1 back and use fyl2x
+	fyl2xp1				// log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+2:	faddl	MO(one)			// x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fyl2x				// log(x+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+	// x > 2^28 => y = log(x) + log(2)
+	.align ALIGNARG(4)
+3:	fyl2x				// log(x)
+	fldln2				// log(2) : log(x)
+	faddp				// log(x)+log(2)
+	ret
+
+	// 2^28 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+	.align ALIGNARG(4)
+4:	fld	%st			// x : x : log(2)
+	fadd	%st, %st(1)		// x : 2*x : log(2)
+	fld	%st			// x : x : 2*x : log(2)
+	fmul	%st(1)			// x^2 : x : 2*x : log(2)
+	fsubl	MO(one)			// x^2-1 : x : 2*x : log(2)
+	fsqrt				// sqrt(x^2-1) : x : 2*x : log(2)
+	faddp				// x+sqrt(x^2-1) : 2*x : log(2)
+	fdivrl	MO(one)			// 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+	fsubrp				// 2*x-1/(x+sqrt(x^2-1)) : log(2)
+	fyl2x				// log(2*x-1/(x+sqrt(x^2-1)))
+	ret
+
+	// x < 1 (or -NaN) => NaN
+	.align ALIGNARG(4)
+5:	fldl	4(%esp)			// x
+	fsub	%st			// x-x
+	fdiv	%st, %st(0)		// (x-x)/(x-x) => NaN
+	ret
+END(__ieee754_acosh)
+strong_alias (__ieee754_acosh, __acosh_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S b/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S
new file mode 100644
index 0000000000..662fda3c06
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S
@@ -0,0 +1,101 @@
+/* ix87 specific implementation of arccosh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29		// fyl2xp1 is only used at or below this value
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_acoshf)
+	movl	4(%esp), %ecx		// bit pattern of x
+	cmpl	$0x3f800000, %ecx
+	jl	5f			// < 1 => invalid
+	fldln2				// log(2)
+	flds	4(%esp)			// x : log(2)
+	cmpl	$0x47000000, %ecx
+	ja	3f			// x > 2^15 (0x47000000 is 32768.0f)
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x40000000, %ecx
+	ja	4f			// x > 2
+
+	// 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	fsubl	MO(one)			// x-1 : log(2)
+	fabs				// acosh(1) is +0 in all rounding modes
+	fld	%st			// x-1 : x-1 : log(2)
+	fmul	%st(1)			// (x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// x-1+(x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// 2*(x-1)+(x-1)^2 : x-1 : log(2)
+	fsqrt				// sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+	faddp				// x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fcoml	MO(limit)		// too large for fyl2xp1?
+	fnstsw
+	sahf
+	ja	2f			// yes: add 1 back and use fyl2x
+	fyl2xp1				// log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+2:	faddl	MO(one)			// x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fyl2x				// log(x+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+	// x > 2^15 => y = log(x) + log(2)
+	.align ALIGNARG(4)
+3:	fyl2x				// log(x)
+	fldln2				// log(2) : log(x)
+	faddp				// log(x)+log(2)
+	ret
+
+	// 2^15 >= x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+	.align ALIGNARG(4)
+4:	fld	%st			// x : x : log(2)
+	fadd	%st, %st(1)		// x : 2*x : log(2)
+	fld	%st			// x : x : 2*x : log(2)
+	fmul	%st(1)			// x^2 : x : 2*x : log(2)
+	fsubl	MO(one)			// x^2-1 : x : 2*x : log(2)
+	fsqrt				// sqrt(x^2-1) : x : 2*x : log(2)
+	faddp				// x+sqrt(x^2-1) : 2*x : log(2)
+	fdivrl	MO(one)			// 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+	fsubrp				// 2*x-1/(x+sqrt(x^2-1)) : log(2)
+	fyl2x				// log(2*x-1/(x+sqrt(x^2-1)))
+	ret
+
+	// x < 1 (or -NaN) => NaN
+	.align ALIGNARG(4)
+5:	flds	4(%esp)			// x
+	fsub	%st			// x-x
+	fdiv	%st, %st(0)		// (x-x)/(x-x) => NaN
+	ret
+END(__ieee754_acoshf)
+strong_alias (__ieee754_acoshf, __acoshf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S b/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S
new file mode 100644
index 0000000000..e0d6466aac
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S
@@ -0,0 +1,107 @@
+/* ix87 specific implementation of arccosh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	/* Please note that we use double value for 1.0.  This number
+	   has an exact representation and so we don't get accuracy
+	   problems.  The advantage is that the code is simpler.  */
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_acoshl)
+	movl	12(%esp), %ecx		// sign/exponent word of x (+ padding)
+	andl	$0xffff, %ecx		// keep sign and exponent only
+	cmpl	$0x3fff, %ecx
+	jl	5f			// < 1 => invalid
+	fldln2				// log(2)
+	fldt	4(%esp)			// x : log(2)
+	cmpl	$0x4020, %ecx
+	ja	3f			// x >= 2^34 (exponent compare only)
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpl	$0x4000, %ecx
+	ja	4f			// exponent > 0x4000, i.e. x >= 4
+
+	// 1 <= x < 4 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	fsubl	MO(one)			// x-1 : log(2)
+	fabs				// acosh(1) is +0 in all rounding modes
+	fld	%st			// x-1 : x-1 : log(2)
+	fmul	%st(1)			// (x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// x-1+(x-1)^2 : x-1 : log(2)
+	fadd	%st(1)			// 2*(x-1)+(x-1)^2 : x-1 : log(2)
+	fsqrt				// sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+	faddp				// x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fcoml	MO(limit)		// too large for fyl2xp1?
+	fnstsw
+	sahf
+	ja	2f			// yes: add 1 back and use fyl2x
+	fyl2xp1				// log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+2:	faddl	MO(one)			// x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+	fyl2x				// log(x+sqrt(2*(x-1)+(x-1)^2))
+	ret
+
+	// x >= 2^34 => y = log(x) + log(2)
+	.align ALIGNARG(4)
+3:	fyl2x				// log(x)
+	fldln2				// log(2) : log(x)
+	faddp				// log(x)+log(2)
+	ret
+
+	// 2^34 > x >= 4 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+	.align ALIGNARG(4)
+4:	fld	%st			// x : x : log(2)
+	fadd	%st, %st(1)		// x : 2*x : log(2)
+	fld	%st			// x : x : 2*x : log(2)
+	fmul	%st(1)			// x^2 : x : 2*x : log(2)
+	fsubl	MO(one)			// x^2-1 : x : 2*x : log(2)
+	fsqrt				// sqrt(x^2-1) : x : 2*x : log(2)
+	faddp				// x+sqrt(x^2-1) : 2*x : log(2)
+	fdivrl	MO(one)			// 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+	fsubrp				// 2*x-1/(x+sqrt(x^2-1)) : log(2)
+	fyl2x				// log(2*x-1/(x+sqrt(x^2-1)))
+	ret
+
+	// x < 1 => NaN
+	.align ALIGNARG(4)
+5:	fldz				// 0
+	fdiv	%st, %st(0)		// 0/0 => NaN
+	ret
+END(__ieee754_acoshl)
+strong_alias (__ieee754_acoshl, __acoshl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosl.c b/REORG.TODO/sysdeps/i386/fpu/e_acosl.c
new file mode 100644
index 0000000000..ab08931924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosl.c
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_acosl (long double x)
+{
+  long double res;
+
+  /* acosl = atanl (sqrtl((1-x) (1+x)) / x) */
+  asm (	"fld	%%st\n"			/* x : x */
+	"fld1\n"			/* 1 : x : x */
+	"fsubp\n"			/* 1 - x : x */
+	"fld1\n"			/* 1 : 1 - x : x */
+	"fadd	%%st(2)\n"		/* 1 + x : 1 - x : x */
+	"fmulp\n"			/* 1 - x^2 : x */
+	"fsqrt\n"			/* sqrtl (1 - x^2) : x */
+	"fabs\n"			/* |sqrtl (1 - x^2)| : x */
+	"fxch	%%st(1)\n"		/* x : sqrtl (1 - x^2) */
+	"fpatan"			/* atanl (sqrtl (1 - x^2) / x) */
+	: "=t" (res) : "0" (x) : "st(1)");
+  return res;
+}
+strong_alias (__ieee754_acosl, __acosl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_asin.S b/REORG.TODO/sysdeps/i386/fpu/e_asin.S
new file mode 100644
index 0000000000..39c8b47da4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_asin.S
@@ -0,0 +1,38 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_asin.S,v 1.4 1995/05/08 23:45:40 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+
+/* asin = atan (x / sqrt((1-x) (1+x))) */
+ENTRY(__ieee754_asin)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)			/* x */
+	fld	%st			/* x : x */
+	fld1				/* 1 : x : x */
+	fsubp				/* 1 - x : x */
+	fld1				/* 1 : 1 - x : x */
+	fadd	%st(2)			/* 1 + x : 1 - x : x */
+	fmulp				/* 1 - x^2 : x */
+	fsqrt				/* sqrt (1 - x^2) : x */
+	fpatan				/* atan (x / sqrt (1 - x^2)) */
+	DBL_CHECK_FORCE_UFLOW
+	ret
+END (__ieee754_asin)
+strong_alias (__ieee754_asin, __asin_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_asinf.S b/REORG.TODO/sysdeps/i386/fpu/e_asinf.S
new file mode 100644
index 0000000000..1102bdedfd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_asinf.S
@@ -0,0 +1,39 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: $")
+
+	.section .rodata.cst4,"aM",@progbits,4
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+
+/* asin = atan (x / sqrt(1 - x^2)) */
+ENTRY(__ieee754_asinf)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)			/* x */
+	fld	%st			/* x : x */
+	fmul	%st(0)			/* x^2 : x */
+	fld1				/* 1 : x^2 : x */
+	fsubp				/* 1 - x^2 : x */
+	fsqrt				/* sqrt (1 - x^2) : x */
+	fpatan				/* atan (x / sqrt (1 - x^2)) */
+	FLT_CHECK_FORCE_UFLOW
+	ret
+END (__ieee754_asinf)
+strong_alias (__ieee754_asinf, __asinf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2.S b/REORG.TODO/sysdeps/i386/fpu/e_atan2.S
new file mode 100644
index 0000000000..25f43bb5a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_atan2.S,v 1.4 1995/05/08 23:46:28 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atan2)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	 4(%esp)		/* first argument */
+	fldl	12(%esp)		/* second argument : first argument */
+	fpatan				/* atan (first / second) */
+	DBL_CHECK_FORCE_UFLOW_NARROW
+	ret
+END (__ieee754_atan2)
+strong_alias (__ieee754_atan2, __atan2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S b/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S
new file mode 100644
index 0000000000..2bc909a762
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_atan2f.S,v 1.1 1995/05/08 23:35:10 jtc Exp $")
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atan2f)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)			/* first argument */
+	flds	8(%esp)			/* second argument : first argument */
+	fpatan				/* atan (first / second) */
+	FLT_CHECK_FORCE_UFLOW_NARROW
+	ret
+END (__ieee754_atan2f)
+strong_alias (__ieee754_atan2f, __atan2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c b/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c
new file mode 100644
index 0000000000..9f88bfcc08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_atan2l (long double y, long double x)
+{
+  long double res;
+  /* fpatan takes X in %st(0) and Y in %st(1), pops one, result on top.  */
+  asm ("fpatan" : "=t" (res) : "u" (y), "0" (x) : "st(1)");
+
+  return res;
+}
+strong_alias (__ieee754_atan2l, __atan2l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanh.S b/REORG.TODO/sysdeps/i386/fpu/e_atanh.S
new file mode 100644
index 0000000000..cbc93d5da2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanh.S
@@ -0,0 +1,112 @@
+/* ix87 specific implementation of arctanh function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type half,@object
+half:	.double 0.5
+	ASM_SIZE_DIRECTIVE(half)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29		// fyl2xp1 is only used below this value
+	ASM_SIZE_DIRECTIVE(limit)
+	.type ln2_2,@object
+ln2_2:	.tfloat 0.3465735902799726547086160	// 0.5*ln2
+	ASM_SIZE_DIRECTIVE(ln2_2)
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atanh)
+	movl	8(%esp), %ecx		// high word of x
+
+	movl	%ecx, %eax
+	andl	$0x7fffffff, %eax	// drop the sign bit
+	cmpl	$0x7ff00000, %eax
+	jae	5f			// exponent all ones: Inf or NaN
+7:
+
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	andl	$0x80000000, %ecx // ECX == 0 iff X >= 0
+
+	fldt	MO(ln2_2)	// 0.5*ln2
+	xorl	%ecx, 8(%esp)	// clear the sign bit of x in place
+	fldl	4(%esp)		// |x| : 0.5*ln2
+	fcoml	MO(half)	// |x| : 0.5*ln2
+	fld	%st		// |x| : |x| : 0.5*ln2
+	fnstsw			// |x| : |x| : 0.5*ln2
+	sahf
+	jae	2f		// |x| >= 0.5
+	fadd	%st, %st(1)	// |x| : 2*|x| : 0.5*ln2
+	fld	%st		// |x| : |x| : 2*|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : |x| : 2*|x| : 0.5*ln2
+	fxch			// |x| : 1-|x| : 2*|x| : 0.5*ln2
+	fmul	%st(2)		// 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+	fdivp			// (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+	faddp			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fcoml	MO(limit)	// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fnstsw			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	sahf
+	jae	4f		// too large for fyl2xp1
+	fyl2xp1			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	DBL_CHECK_FORCE_UFLOW_NONNEG
+	jecxz	3f		// x was non-negative: keep the sign
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+4:	faddl	MO(one)		// 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	jecxz	3f		// x was non-negative: keep the sign
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+2:	faddl	MO(one)		// 1+|x| : |x| : 0.5*ln2
+	fxch			// |x| : 1+|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : 1+|x| : 0.5*ln2
+	fdivrp			// (1+|x|)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld((1+|x|)/(1-|x|))
+	jecxz	3f		// x was non-negative: keep the sign
+	fchs			// 0.5*ln2*ld((1+x)/(1-x))
+3:	ret
+
+	// x == NaN or ±Inf
+5:	ja	6f		// high word above Inf pattern: NaN
+	cmpl	$0, 4(%esp)	// low word zero means Inf
+	je	7b		// Inf goes through the normal path
+6:	fldl	4(%esp)		// NaN: return the argument itself
+	ret
+END(__ieee754_atanh)
+strong_alias (__ieee754_atanh, __atanh_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S b/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S
new file mode 100644
index 0000000000..92fda3fd82
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S
@@ -0,0 +1,109 @@
+/* ix87 specific implementation of arctanh function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type half,@object
+half:	.double 0.5
+	ASM_SIZE_DIRECTIVE(half)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29		// fyl2xp1 is only used below this value
+	ASM_SIZE_DIRECTIVE(limit)
+	.align ALIGNARG(4)
+	.type ln2_2,@object
+ln2_2:	.tfloat 0.3465735902799726547086160	// 0.5*ln2
+	ASM_SIZE_DIRECTIVE(ln2_2)
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atanhf)
+	movl	4(%esp), %ecx		// bit pattern of x
+
+	movl	%ecx, %eax
+	andl	$0x7fffffff, %eax	// drop the sign bit
+	cmpl	$0x7f800000, %eax
+	ja	5f			// above the Inf pattern: NaN
+
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	andl	$0x80000000, %ecx // ECX == 0 iff X >= 0
+
+	fldt	MO(ln2_2)	// 0.5*ln2
+	xorl	%ecx, 4(%esp)	// clear the sign bit of x in place
+	flds	4(%esp)		// |x| : 0.5*ln2
+	fcoml	MO(half)	// |x| : 0.5*ln2
+	fld	%st(0)		// |x| : |x| : 0.5*ln2
+	fnstsw			// |x| : |x| : 0.5*ln2
+	sahf
+	jae	2f		// |x| >= 0.5
+	fadd	%st, %st(1)	// |x| : 2*|x| : 0.5*ln2
+	fld	%st		// |x| : |x| : 2*|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : |x| : 2*|x| : 0.5*ln2
+	fxch			// |x| : 1-|x| : 2*|x| : 0.5*ln2
+	fmul	%st(2)		// 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+	fdivp			// (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+	faddp			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fcoml	MO(limit)	// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fnstsw			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	sahf
+	jae	4f		// too large for fyl2xp1
+	fyl2xp1			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	FLT_CHECK_FORCE_UFLOW_NONNEG
+	jecxz	3f		// x was non-negative: keep the sign
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+4:	faddl	MO(one)		// 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	jecxz	3f		// x was non-negative: keep the sign
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+2:	faddl	MO(one)		// 1+|x| : |x| : 0.5*ln2
+	fxch			// |x| : 1+|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : 1+|x| : 0.5*ln2
+	fdivrp			// (1+|x|)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld((1+|x|)/(1-|x|))
+	jecxz	3f		// x was non-negative: keep the sign
+	fchs			// 0.5*ln2*ld((1+x)/(1-x))
+3:	ret
+
+	// x == NaN
+5:	flds	4(%esp)		// return the NaN argument itself
+	ret
+END(__ieee754_atanhf)
+strong_alias (__ieee754_atanhf, __atanhf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S b/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S
new file mode 100644
index 0000000000..31ff7e5182
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S
@@ -0,0 +1,127 @@
+/* ix87 specific implementation of arctanh function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	/* Please note that we use double values for 0.5 and 1.0.  These
+	   numbers have exact representations and so we don't get accuracy
+	   problems.  The advantage is that the code is simpler.  */
+	.type half,@object
+half:	.double 0.5
+	ASM_SIZE_DIRECTIVE(half)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	.align ALIGNARG(4)
+	.type ln2_2,@object
+ln2_2:	.tfloat 0.3465735902799726547086160
+	ASM_SIZE_DIRECTIVE(ln2_2)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_atanhl)	// atanhl(x): long double x at 4(%esp), result left in st(0)
+	movl	12(%esp), %ecx	// low 16 bits: sign and exponent word of x
+
+	movl	%ecx, %eax
+	andl	$0x7fff, %eax	// EAX = biased exponent field
+	cmpl	$0x7fff, %eax
+	je	5f		// exponent all ones: NaN or Inf
+	cmpl	$0x3fdf, %eax
+	jge	7f
+	// Exponent below -32; return x, with underflow if subnormal.
+	fldt	4(%esp)
+	cmpl	$0, %eax
+	jne	8f		// exponent field nonzero: return x as is
+	fld	%st(0)
+	fmul	%st(0)		// x*x, computed only for its exception flags
+	fstp	%st(0)		// discard the product; x stays in st(0)
+8:	ret
+7:
+
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	andl	$0x8000, %ecx	// ECX == 0 iff X >= 0
+
+	fldt	MO(ln2_2)	// 0.5*ln2
+	xorl	%ecx, 12(%esp)	// clear the sign bit of the stack copy of x
+	fldt	4(%esp)		// |x| : 0.5*ln2
+	fcoml	MO(half)	// |x| : 0.5*ln2
+	fld	%st(0)		// |x| : |x| : 0.5*ln2
+	fnstsw			// |x| : |x| : 0.5*ln2
+	sahf			// C0 of the fcoml result -> CF
+	jae	2f		// CF clear: |x| >= 0.5
+	fadd	%st, %st(1)	// |x| : 2*|x| : 0.5*ln2
+	fld	%st		// |x| : |x| : 2*|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : |x| : 2*|x| : 0.5*ln2
+	fxch			// |x| : 1-|x| : 2*|x| : 0.5*ln2
+	fmul	%st(2)		// 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+	fdivp			// (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+	faddp			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fcoml	MO(limit)	// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fnstsw			// 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	sahf			// C0 of the fcoml result -> CF
+	jae	4f		// CF clear: value >= limit, take the fyl2x path
+	fyl2xp1			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	jecxz	3f		// ECX == 0: sign bit of x was clear, keep the result
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+4:	faddl	MO(one)		// 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+	jecxz	3f		// ECX == 0: sign bit of x was clear, keep the result
+	fchs			// 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3:	ret
+
+	.align ALIGNARG(4)
+2:	faddl	MO(one)		// 1+|x| : |x| : 0.5*ln2
+	fxch			// |x| : 1+|x| : 0.5*ln2
+	fsubrl	MO(one)		// 1-|x| : 1+|x| : 0.5*ln2
+	fdivrp			// (1+|x|)/(1-|x|) : 0.5*ln2
+	fyl2x			// 0.5*ln2*ld((1+|x|)/(1-|x|))
+	jecxz	3f		// ECX == 0: sign bit of x was clear, keep the result
+	fchs			// 0.5*ln2*ld((1+x)/(1-x))
+3:	ret
+
+	// x == NaN or ±Inf
+5:	cmpl	$0x80000000, 8(%esp)	// high mantissa word
+	ja	6f		// above 0x80000000: NaN
+	cmpl	$0, 4(%esp)	// low mantissa word
+	je	7b		// zero: treat as Inf and take the general path
+6:	fldt	4(%esp)
+	fadd	%st(0)		// x + x, so the NaN passes through an arithmetic operation
+	ret
+END(__ieee754_atanhl)
+strong_alias (__ieee754_atanhl, __atanhl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp.S b/REORG.TODO/sysdeps/i386/fpu/e_exp.S
new file mode 100644
index 0000000000..a7e7f13f6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp.S
@@ -0,0 +1,73 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+/* e^x = 2^(x * log2(e)) */
+ENTRY(__ieee754_exp)		/* exp(x): double x at 4(%esp), result left in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)			/* x */
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax			/* AH = high byte of the status word.  */
+	movb	$0x45, %dh		/* Mask for C3, C2 and C0.  */
+	andb	%ah, %dh
+	cmpb	$0x05, %dh		/* C2 and C0 set, C3 clear.  */
+	je	1f			/* Is +-Inf, jump.  */
+	fldl2e				/* log2(e) : x */
+	fmulp				/* x * log2(e) */
+	fld	%st			/* Duplicate it.  */
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	DBL_NARROW_EVAL_UFLOW_NONNEG_NAN	/* NOTE(review): macro defined outside this view; presumably narrows to double and raises underflow when needed -- confirm.  */
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st			/* Pop -Inf.  */
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp)
+
+
+ENTRY(__exp_finite)		/* Same computation as __ieee754_exp, without the fxam test for +-Inf.  */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl2e				/* log2(e) */
+	fmull	4(%esp)			/* x * log2(e) */
+	fld	%st			/* Duplicate it.  */
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	DBL_NARROW_EVAL_UFLOW_NONNEG	/* NOTE(review): macro defined outside this view; presumably narrows to double and raises underflow when needed -- confirm.  */
+	ret
+END(__exp_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10.S
new file mode 100644
index 0000000000..acb5160a3f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10.S
@@ -0,0 +1,53 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+/* 10^x = 2^(x * log2(10)) */
+ENTRY(__ieee754_exp10)		/* exp10(x): double x at 4(%esp), result left in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)			/* x */
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax			/* AH = high byte of the status word.  */
+	movb	$0x45, %dh		/* Mask for C3, C2 and C0.  */
+	andb	%ah, %dh
+	cmpb	$0x05, %dh		/* C2 and C0 set, C3 clear.  */
+	je	1f			/* Is +-Inf, jump.  */
+	fldl2t				/* log2(10) : x */
+	fmulp				/* x * log2(10) */
+	fld	%st			/* Duplicate it.  */
+	frndint				/* int(x * log2(10)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(10)) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x * log2(10))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(10))) */
+	fscale				/* 10^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	DBL_NARROW_EVAL_UFLOW_NONNEG_NAN	/* NOTE(review): macro defined outside this view; presumably narrows to double and raises underflow when needed -- confirm.  */
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st			/* Pop -Inf.  */
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp10)
+strong_alias (__ieee754_exp10, __exp10_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S
new file mode 100644
index 0000000000..1812b34398
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S
@@ -0,0 +1,53 @@
+/*
+ * Written by Ulrich Drepper.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+/* 10^x = 2^(x * log2(10)) */
+ENTRY(__ieee754_exp10f)		/* exp10f(x): float x at 4(%esp), result left in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)			/* x */
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax			/* AH = high byte of the status word.  */
+	movb	$0x45, %dh		/* Mask for C3, C2 and C0.  */
+	andb	%ah, %dh
+	cmpb	$0x05, %dh		/* C2 and C0 set, C3 clear.  */
+	je	1f			/* Is +-Inf, jump.  */
+	fldl2t				/* log2(10) : x */
+	fmulp				/* x * log2(10) */
+	fld	%st			/* Duplicate it.  */
+	frndint				/* int(x * log2(10)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(10)) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x * log2(10))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(10))) */
+	fscale				/* 10^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	FLT_NARROW_EVAL_UFLOW_NONNEG_NAN	/* NOTE(review): macro defined outside this view; presumably narrows to float and raises underflow when needed -- confirm.  */
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st			/* Pop -Inf.  */
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp10f)
+strong_alias (__ieee754_exp10f, __exp10f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S
new file mode 100644
index 0000000000..d843e2b5e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S
@@ -0,0 +1,2 @@
+#define USE_AS_EXP10L
+#include <e_expl.S>
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2.S
new file mode 100644
index 0000000000..fc16a96053
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2.S
@@ -0,0 +1,52 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_exp2)		/* exp2(x): double x at 4(%esp), result left in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	4(%esp)			/* x */
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax			/* AH = high byte of the status word.  */
+	movb	$0x45, %dh		/* Mask for C3, C2 and C0.  */
+	andb	%ah, %dh
+	cmpb	$0x05, %dh		/* C2 and C0 set, C3 clear.  */
+	je	1f			/* Is +-Inf, jump.  */
+	fld	%st			/* Duplicate x.  */
+	frndint				/* int(x) */
+	fsubr	%st,%st(1)		/* fract(x) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x)) - 1 */
+	fld1
+	faddp				/* 2^(fract(x)) */
+	fscale				/* 2^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	DBL_NARROW_EVAL_UFLOW_NONNEG_NAN	/* NOTE(review): macro defined outside this view; presumably narrows to double and raises underflow when needed -- confirm.  */
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st			/* Pop -Inf.  */
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp2)
+strong_alias (__ieee754_exp2, __exp2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S
new file mode 100644
index 0000000000..30623cd850
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S
@@ -0,0 +1,52 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_exp2f)		/* exp2f(x): float x at 4(%esp), result left in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)			/* x */
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax			/* AH = high byte of the status word.  */
+	movb	$0x45, %dh		/* Mask for C3, C2 and C0.  */
+	andb	%ah, %dh
+	cmpb	$0x05, %dh		/* C2 and C0 set, C3 clear.  */
+	je	1f			/* Is +-Inf, jump.  */
+	fld	%st			/* Duplicate x.  */
+	frndint				/* int(x) */
+	fsubr	%st,%st(1)		/* fract(x) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x)) - 1 */
+	fld1
+	faddp				/* 2^(fract(x)) */
+	fscale				/* 2^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	FLT_NARROW_EVAL_UFLOW_NONNEG_NAN	/* NOTE(review): macro defined outside this view; presumably narrows to float and raises underflow when needed -- confirm.  */
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st			/* Pop -Inf.  */
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp2f)
+strong_alias (__ieee754_exp2f, __exp2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S
new file mode 100644
index 0000000000..c4cb73d589
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S
@@ -0,0 +1,60 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_exp2l)		/* exp2l(x): long double x at 4(%esp), result left in st(0) */
+#ifdef PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldt	4(%esp)			/* x */
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax			/* AH = high byte of the status word.  */
+	movb	$0x45, %dh		/* Mask for C3, C2 and C0.  */
+	andb	%ah, %dh
+	cmpb	$0x05, %dh		/* C2 and C0 set, C3 clear.  */
+	je	1f			/* Is +-Inf, jump.  */
+	movzwl	4+8(%esp), %eax		/* Sign and exponent word of x.  */
+	andl	$0x7fff, %eax		/* Keep the exponent field only.  */
+	cmpl	$0x3fbe, %eax
+	jge	3f
+	/* Argument's exponent below -65, result rounds to 1.  */
+	fld1
+	faddp				/* 1 + x */
+	ret
+3:	fld	%st			/* Duplicate x.  */
+	frndint				/* int(x) */
+	fsubr	%st,%st(1)		/* fract(x) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x)) - 1 */
+	fld1
+	faddp				/* 2^(fract(x)) */
+	fscale				/* 2^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN	/* NOTE(review): macro defined outside this view; presumably raises underflow for tiny results -- confirm.  */
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st			/* Pop -Inf.  */
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_exp2l)
+strong_alias (__ieee754_exp2l, __exp2l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_expf.S b/REORG.TODO/sysdeps/i386/fpu/e_expf.S
new file mode 100644
index 0000000000..65cb4ec204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_expf.S
@@ -0,0 +1,74 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+/* e^x = 2^(x * log2(e)) */
+ENTRY(__ieee754_expf)		/* expf(x): float x at 4(%esp), result left in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	flds	4(%esp)			/* x */
+/* I added the following ugly construct because exp(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam				/* Is NaN or +-Inf?  */
+	fstsw	%ax			/* AH = high byte of the status word.  */
+	movb	$0x45, %dh		/* Mask for C3, C2 and C0.  */
+	andb	%ah, %dh
+	cmpb	$0x05, %dh		/* C2 and C0 set, C3 clear.  */
+	je	1f			/* Is +-Inf, jump.  */
+	fldl2e				/* log2(e) : x */
+	fmulp				/* x * log2(e) */
+	fld	%st			/* Duplicate it.  */
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	FLT_NARROW_EVAL_UFLOW_NONNEG_NAN	/* NOTE(review): macro defined outside this view; presumably narrows to float and raises underflow when needed -- confirm.  */
+	ret
+
+1:	testl	$0x200, %eax		/* Test sign.  */
+	jz	2f			/* If positive, jump.  */
+	fstp	%st			/* Pop -Inf.  */
+	fldz				/* Set result to 0.  */
+2:	ret
+END (__ieee754_expf)
+
+
+ENTRY(__expf_finite)		/* Same computation as __ieee754_expf, without the fxam test for +-Inf.  */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl2e				/* log2(e) */
+	fmuls	4(%esp)			/* x * log2(e) */
+	fld	%st			/* Duplicate it.  */
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch				/* Bring the fraction to st(0).  */
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)			/* Drop the integer part.  */
+	FLT_NARROW_EVAL_UFLOW_NONNEG	/* NOTE(review): macro defined outside this view; presumably narrows to float and raises underflow when needed -- confirm.  */
+	ret
+END(__expf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_expl.S b/REORG.TODO/sysdeps/i386/fpu/e_expl.S
new file mode 100644
index 0000000000..7d75fe22a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_expl.S
@@ -0,0 +1,226 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+/*
+ * The 8087 method for the exponential function is to calculate
+ *   exp(x) = 2^(x log2(e))
+ * after separating integer and fractional parts
+ *   x log2(e) = i + f, |f| <= .5
+ * 2^i is immediate but f needs to be precise for long double accuracy.
+ * Suppress range reduction error in computing f by the following.
+ * Separate x into integer and fractional parts
+ *   x = xi + xf, |xf| <= .5
+ * Separate log2(e) into the sum of an exact number c0 and small part c1.
+ *   c0 + c1 = log2(e) to extra precision
+ * Then
+ *   f = (c0 xi - i) + c0 xf + c1 x
+ * where c0 xi is exact and so also is (c0 xi - i).
+ * -- moshier@na-net.ornl.gov
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+#ifdef USE_AS_EXP10L
+# define IEEE754_EXPL __ieee754_exp10l
+# define EXPL_FINITE __exp10l_finite
+# define FLDLOG fldl2t
+#elif defined USE_AS_EXPM1L
+# define IEEE754_EXPL __expm1l
+# undef EXPL_FINITE
+# define FLDLOG fldl2e
+#else
+# define IEEE754_EXPL __ieee754_expl
+# define EXPL_FINITE __expl_finite
+# define FLDLOG fldl2e
+#endif
+
+	.section .rodata.cst16,"aM",@progbits,16
+
+	.p2align 4
+#ifdef USE_AS_EXP10L
+	.type c0,@object
+c0:	.byte 0, 0, 0, 0, 0, 0, 0x9a, 0xd4, 0x00, 0x40
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(c0)
+	.type c1,@object
+c1:	.byte 0x58, 0x92, 0xfc, 0x15, 0x37, 0x9a, 0x97, 0xf0, 0xef, 0x3f
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(c1)
+#else
+	.type c0,@object
+c0:	.byte 0, 0, 0, 0, 0, 0, 0xaa, 0xb8, 0xff, 0x3f
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(c0)
+	.type c1,@object
+c1:	.byte 0x20, 0xfa, 0xee, 0xc2, 0x5f, 0x70, 0xa5, 0xec, 0xed, 0x3f
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(c1)
+#endif
+#ifndef USE_AS_EXPM1L
+	.type csat,@object
+csat:	.byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x0e, 0x40
+	.byte 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(csat)
+DEFINE_LDBL_MIN
+#endif
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(IEEE754_EXPL)		/* expl, exp10l or expm1l depending on USE_AS_*; long double x at 4(%esp), result left in st(0) */
+#ifdef USE_AS_EXPM1L
+	movzwl	4+8(%esp), %eax	/* Sign and exponent word of x.  */
+	xorb	$0x80, %ah	// invert sign bit (now 1 is "positive")
+	cmpl	$0xc006, %eax	// is num positive and exp >= 6 (number is >= 128.0)?
+	jae	HIDDEN_JUMPTARGET (__expl) // (if num is denormal, it is at least >= 64.0)
+#endif
+	fldt	4(%esp)
+/* I added the following ugly construct because expl(+-Inf) resulted
+   in NaN.  The ugliness results from the bright minds at Intel.
+   For the i686 the code can be written better.
+   -- drepper@cygnus.com.  */
+	fxam			/* Is NaN or +-Inf?  */
+#ifdef PIC
+	LOAD_PIC_REG (cx)
+#endif
+#ifdef USE_AS_EXPM1L
+	xorb	$0x80, %ah	/* Flip the sign bit back.  */
+	cmpl	$0xc006, %eax
+	fstsw	%ax		/* Fetch the fxam result; EFLAGS from the cmpl are kept for the jb.  */
+	movb	$0x45, %dh	/* Mask for C3, C2 and C0.  */
+	jb	4f
+
+	/* Below -64.0 (may be -NaN or -Inf). */
+	andb	%ah, %dh
+	cmpb	$0x01, %dh
+	je	6f		/* Is +-NaN, jump.  */
+	jmp	1f		/* -large, possibly -Inf.  */
+
+4:	/* In range -64.0 to 64.0 (may be +-0 but not NaN or +-Inf).  */
+	/* Test for +-0 as argument.  */
+	andb	%ah, %dh
+	cmpb	$0x40, %dh
+	je	2f		/* Zero: return x itself.  */
+
+	/* Test for arguments that are small but not subnormal.  */
+	movzwl	4+8(%esp), %eax
+	andl	$0x7fff, %eax	/* Keep the exponent field only.  */
+	cmpl	$0x3fbf, %eax
+	jge	3f
+	/* Argument's exponent below -64; avoid spurious underflow if
+	   normal.  */
+	cmpl	$0x0001, %eax
+	jge	2f		/* Normal: return x itself.  */
+	/* Force underflow and return the argument, to avoid wrong signs
+	   of zero results from the code below in some rounding modes.  */
+	fld	%st
+	fmul	%st		/* x*x, computed only for its exception flags.  */
+	fstp	%st		/* Discard the product; x stays in st(0).  */
+	jmp	2f
+#else
+	movzwl	4+8(%esp), %eax	/* Sign and exponent word of x.  */
+	andl	$0x7fff, %eax	/* Keep the exponent field only.  */
+	cmpl	$0x400d, %eax
+	jg	5f		/* Exponent above 0x400d: |x| >= 2^15, Inf or NaN.  */
+	cmpl	$0x3fbc, %eax
+	jge	3f
+	/* Argument's exponent below -67, result rounds to 1.  */
+	fld1
+	faddp			/* 1 + x */
+	jmp	2f
+5:	/* Overflow, underflow or infinity or NaN as argument.  */
+	fstsw	%ax		/* Condition bits still hold the fxam result.  */
+	movb	$0x45, %dh	/* Mask for C3, C2 and C0.  */
+	andb	%ah, %dh
+	cmpb	$0x05, %dh
+	je	1f		/* Is +-Inf, jump.    */
+	cmpb	$0x01, %dh
+	je	6f		/* Is +-NaN, jump.    */
+	/* Overflow or underflow; saturate.  */
+	fstp	%st
+	fldt	MO(csat)	/* csat encodes 2^15.  */
+	andb	$2, %ah		/* C1 from fxam: sign of x.  */
+	jz	3f		/* Positive: continue with +csat.  */
+	fchs			/* Negative: continue with -csat.  */
+#endif
+3:	FLDLOG			/* 1  log2(base)      */
+	fmul	%st(1), %st	/* 1  x log2(base)    */
+	/* Set round-to-nearest temporarily.  */
+	subl	$8, %esp
+	cfi_adjust_cfa_offset (8)
+	fstcw	4(%esp)		/* Save the current control word.  */
+	movl	$0xf3ff, %edx	/* Mask that clears bits 10-11 (rounding control).  */
+	andl	4(%esp), %edx
+	movl	%edx, (%esp)
+	fldcw	(%esp)
+	frndint			/* 1  i               */
+	fld	%st(1)		/* 2  x               */
+	frndint			/* 2  xi              */
+	fldcw	4(%esp)		/* Restore the saved control word.  */
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fld	%st(1)		/* 3  i               */
+	fldt	MO(c0)		/* 4  c0              */
+	fld	%st(2)		/* 5  xi              */
+	fmul	%st(1), %st	/* 5  c0 xi           */
+	fsubp	%st, %st(2)	/* 4  f = c0 xi  - i  */
+	fld	%st(4)		/* 5  x               */
+	fsub	%st(3), %st	/* 5  xf = x - xi     */
+	fmulp	%st, %st(1)	/* 4  c0 xf           */
+	faddp	%st, %st(1)	/* 3  f = f + c0 xf   */
+	fldt	MO(c1)		/* 4                  */
+	fmul	%st(4), %st	/* 4  c1 * x          */
+	faddp	%st, %st(1)	/* 3  f = f + c1 * x  */
+	f2xm1			/* 3 2^(fract(x * log2(base))) - 1 */
+#ifdef USE_AS_EXPM1L
+	fstp	%st(1)		/* 2                  */
+	fscale			/* 2 scale factor is st(1); base^x - 2^i */
+	fxch			/* 2 i                */
+	fld1			/* 3 1.0              */
+	fscale			/* 3 2^i              */
+	fld1			/* 4 1.0              */
+	fsubrp	%st, %st(1)	/* 3 2^i - 1.0        */
+	fstp	%st(1)		/* 2                  */
+	faddp	%st, %st(1)	/* 1 base^x - 1.0     */
+#else
+	fld1			/* 4 1.0              */
+	faddp			/* 3 2^(fract(x * log2(base))) */
+	fstp	%st(1)		/* 2  */
+	fscale			/* 2 scale factor is st(1); base^x */
+	fstp	%st(1)		/* 1  */
+	LDBL_CHECK_FORCE_UFLOW_NONNEG	/* NOTE(review): macro defined outside this view; presumably raises underflow for tiny results -- confirm.  */
+#endif
+	fstp	%st(1)		/* 0  */
+	jmp	2f
+1:
+#ifdef USE_AS_EXPM1L
+	/* For expm1l, only negative sign gets here.  */
+	fstp	%st
+	fld1
+	fchs			/* Result is -1.  */
+#else
+	testl	$0x200, %eax	/* Test sign.  */
+	jz	2f		/* If positive, jump.  */
+	fstp	%st		/* Pop -Inf.  */
+	fldz			/* Set result to 0.  */
+#endif
+2:	ret
+6:	/* NaN argument.  */
+	fadd	%st		/* x + x, so the NaN passes through an arithmetic operation.  */
+	ret
+END(IEEE754_EXPL)
+#ifdef USE_AS_EXPM1L
+libm_hidden_def (__expm1l)
+weak_alias (__expm1l, expm1l)
+#else
+strong_alias (IEEE754_EXPL, EXPL_FINITE)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmod.S b/REORG.TODO/sysdeps/i386/fpu/e_fmod.S
new file mode 100644
index 0000000000..26b3acc392
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmod.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_fmod)		/* fmod(x, y): doubles x at 4(%esp), y at 12(%esp); result left in st(0) */
+	fldl	12(%esp)	/* y */
+	fldl	4(%esp)		/* x : y */
+1:	fprem			/* Partial remainder of st(0) by st(1).  */
+	fstsw	%ax
+	sahf			/* C2 -> PF */
+	jp	1b		/* C2 set: reduction incomplete, repeat.  */
+	fstp	%st(1)		/* Drop y, leaving the remainder in st(0).  */
+	ret
+END (__ieee754_fmod)
+strong_alias (__ieee754_fmod, __fmod_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S b/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S
new file mode 100644
index 0000000000..ece4d98427
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_fmodf)		/* fmodf(x, y): floats x at 4(%esp), y at 8(%esp); result left in st(0) */
+	flds	8(%esp)		/* y */
+	flds	4(%esp)		/* x : y */
+1:	fprem			/* Partial remainder of st(0) by st(1).  */
+	fstsw	%ax
+	sahf			/* C2 -> PF */
+	jp	1b		/* C2 set: reduction incomplete, repeat.  */
+	fstp	%st(1)		/* Drop y, leaving the remainder in st(0).  */
+	ret
+END(__ieee754_fmodf)
+strong_alias (__ieee754_fmodf, __fmodf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c b/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c
new file mode 100644
index 0000000000..49700ae8f6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_fmodl (long double x, long double y)	/* Remainder of x by y, computed with the x87 fprem loop.  */
+{
+  long double res;
+
+  asm ("1:\tfprem\n"		/* Partial remainder of st(0) by st(1).  */
+       "fstsw   %%ax\n"	/* Status word -> AX.  */
+       "sahf\n"			/* C2 -> PF.  */
+       "jp      1b\n"		/* C2 set: reduction incomplete, repeat.  */
+       "fstp    %%st(1)"	/* Drop y, leaving the remainder in st(0).  */
+       : "=t" (res) : "0" (x), "u" (y) : "ax", "st(1)");	/* x in st(0), y in st(1); AX clobbered, st(1) popped.  */
+  return res;
+}
+strong_alias (__ieee754_fmodl, __fmodl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_hypot.S b/REORG.TODO/sysdeps/i386/fpu/e_hypot.S
new file mode 100644
index 0000000000..7403566fd7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_hypot.S
@@ -0,0 +1,75 @@
+/* Compute the hypotenuse of X and Y.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_hypot)		// hypot(x, y): doubles x at 4(%esp), y at 12(%esp); result left in st(0)
+#ifdef  PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fldl	4(%esp)		// x
+	fxam
+	fnstsw
+	fldl	12(%esp)	// y : x
+	movb	%ah, %ch	// CH = fxam condition bits of x
+	fxam
+	fnstsw
+	movb	%ah, %al	// AL = fxam condition bits of y
+	orb	%ch, %ah	// combine the bits of x and y
+	sahf			// C0 -> CF
+	jc	1f		// C0 set for x or y: a NaN or Inf operand
+	fmul	%st(0)		// y * y : x
+	fxch			// x : y * y
+	fmul	%st(0)		// x * x : y * y
+	faddp			// x * x + y * y
+	fsqrt
+	DBL_NARROW_EVAL_UFLOW_NONNEG	// NOTE(review): macro defined outside this view; presumably narrows to double and raises underflow when needed -- confirm
+2:	ret
+
+	// We have to test whether any of the parameters is Inf.
+	// In this case the result is infinity.
+1:	andb	$0x45, %al	// keep C3, C2, C0 of y
+	cmpb	$5, %al
+	je	3f		// jump if y is Inf
+	andb	$0x45, %ch	// keep C3, C2, C0 of x
+	cmpb	$5, %ch
+	jne	4f		// jump if x is not Inf
+	fxch			// x : y, so the Inf is in st(0)
+3:	fstp	%st(1)		// keep st(0), drop the other operand
+	fabs			// result is +Inf
+	jmp	2b
+
+4:	testb	$1, %al		// C0 of y
+	jnz	5f		// y is NaN
+	fxch			// otherwise bring x to st(0)
+5:	fstp	%st(1)		// keep st(0), drop the other operand
+	jmp	2b
+
+END(__ieee754_hypot)
+strong_alias (__ieee754_hypot, __hypot_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S b/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S
new file mode 100644
index 0000000000..6a2c7052b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S
@@ -0,0 +1,64 @@
+/* Compute the hypotenuse of X and Y.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <i386-math-asm.h>
+
+	.text
+ENTRY(__ieee754_hypotf)		// hypotf(x, y): floats x at 4(%esp), y at 8(%esp); result left in st(0)
+	flds	4(%esp)		// x
+	fxam
+	fnstsw
+	flds	8(%esp)		// y : x
+	movb	%ah, %ch	// CH = fxam condition bits of x
+	fxam
+	fnstsw
+	movb	%ah, %al	// AL = fxam condition bits of y
+	orb	%ch, %ah	// combine the bits of x and y
+	sahf			// C0 -> CF
+	jc	1f		// C0 set for x or y: a NaN or Inf operand
+	fmul	%st(0)		// y * y : x
+	fxch			// x : y * y
+	fmul	%st(0)		// x * x : y * y
+	faddp			// x * x + y * y
+	fsqrt
+	FLT_NARROW_EVAL		// NOTE(review): macro defined outside this view; presumably narrows the result to float -- confirm
+2:	ret
+
+	// We have to test whether any of the parameters is Inf.
+	// In this case the result is infinity.
+1:	andb	$0x45, %al	// keep C3, C2, C0 of y
+	cmpb	$5, %al
+	je	3f		// jump if y is Inf
+	andb	$0x45, %ch	// keep C3, C2, C0 of x
+	cmpb	$5, %ch
+	jne	4f		// jump if x is not Inf
+	fxch			// x : y, so the Inf is in st(0)
+3:	fstp	%st(1)		// keep st(0), drop the other operand
+	fabs			// result is +Inf
+	jmp	2b
+
+4:	testb	$1, %al		// C0 of y
+	jnz	5f		// y is NaN
+	fxch			// otherwise bring x to st(0)
+5:	fstp	%st(1)		// keep st(0), drop the other operand
+	jmp	2b
+
+END(__ieee754_hypotf)
+strong_alias (__ieee754_hypotf, __hypotf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S
new file mode 100644
index 0000000000..29ef2214e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S
@@ -0,0 +1,42 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ilogb.S,v 1.5 1995/10/12 15:53:09 jtc Exp $")
+
+ENTRY(__ieee754_ilogb)		/* ilogb(x): double x at 4(%esp), integer result in EAX */
+	fldl	4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+   required to return INT_MAX in ISO C99.
+   -- jakub@redhat.com.  */
+	fxam			/* Is NaN or +-Inf?  */
+	fstsw   %ax
+	movb    $0x45, %dh	/* Mask for C3, C2 and C0.  */
+	andb    %ah, %dh
+	cmpb    $0x05, %dh
+	je      1f		/* Is +-Inf, jump.  */
+	cmpb    $0x40, %dh
+	je      2f		/* Is +-0, jump.  */
+
+	fxtract			/* significand : exponent */
+	pushl	%eax		/* Reserve a 4-byte stack slot.  */
+	cfi_adjust_cfa_offset (4)
+	fstp	%st		/* Drop the significand.  */
+
+	fistpl	(%esp)		/* Store the exponent as an integer.  */
+	fwait
+	popl	%eax		/* EAX = stored exponent.  */
+	cfi_adjust_cfa_offset (-4)
+
+	ret
+
+1:	fstp	%st		/* Pop x.  */
+	movl	$0x7fffffff, %eax	/* INT_MAX */
+	ret
+2:	fstp	%st		/* Pop x.  */
+	movl	$0x80000000, %eax	/* FP_ILOGB0  */
+	ret
+END (__ieee754_ilogb)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S
new file mode 100644
index 0000000000..d72de6c84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S
@@ -0,0 +1,42 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ilogbf.S,v 1.4 1995/10/22 20:32:43 pk Exp $")
+
+ENTRY(__ieee754_ilogbf)		/* ilogbf(x): float x at 4(%esp), integer result in EAX */
+	flds	4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+   required to return INT_MAX in ISO C99.
+   -- jakub@redhat.com.  */
+	fxam			/* Is NaN or +-Inf?  */
+	fstsw   %ax
+	movb    $0x45, %dh	/* Mask for C3, C2 and C0.  */
+	andb    %ah, %dh
+	cmpb    $0x05, %dh
+	je      1f		/* Is +-Inf, jump.  */
+	cmpb    $0x40, %dh
+	je      2f		/* Is +-0, jump.  */
+
+	fxtract			/* significand : exponent */
+	pushl	%eax		/* Reserve a 4-byte stack slot.  */
+	cfi_adjust_cfa_offset (4)
+	fstp	%st		/* Drop the significand.  */
+
+	fistpl	(%esp)		/* Store the exponent as an integer.  */
+	fwait
+	popl	%eax		/* EAX = stored exponent.  */
+	cfi_adjust_cfa_offset (-4)
+
+	ret
+
+1:	fstp	%st		/* Pop x.  */
+	movl	$0x7fffffff, %eax	/* INT_MAX */
+	ret
+2:	fstp	%st		/* Pop x.  */
+	movl	$0x80000000, %eax	/* FP_ILOGB0  */
+	ret
+END (__ieee754_ilogbf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S
new file mode 100644
index 0000000000..60761dfa38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S
@@ -0,0 +1,43 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__ieee754_ilogbl)		/* ilogbl(x): long double x at 4(%esp), integer result in EAX */
+	fldt	4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+   required to return INT_MAX in ISO C99.
+   -- jakub@redhat.com.  */
+	fxam			/* Is NaN or +-Inf?  */
+	fstsw   %ax
+	movb    $0x45, %dh	/* Mask for C3, C2 and C0.  */
+	andb    %ah, %dh
+	cmpb    $0x05, %dh
+	je      1f		/* Is +-Inf, jump.  */
+	cmpb    $0x40, %dh
+	je      2f		/* Is +-0, jump.  */
+
+	fxtract			/* significand : exponent */
+	pushl	%eax		/* Reserve a 4-byte stack slot.  */
+	cfi_adjust_cfa_offset (4)
+	fstp	%st		/* Drop the significand.  */
+
+	fistpl	(%esp)		/* Store the exponent as an integer.  */
+	fwait
+	popl	%eax		/* EAX = stored exponent.  */
+	cfi_adjust_cfa_offset (-4)
+
+	ret
+
+1:	fstp	%st		/* Pop x.  */
+	movl	$0x7fffffff, %eax	/* INT_MAX */
+	ret
+2:	fstp	%st		/* Pop x.  */
+	movl	$0x80000000, %eax	/* FP_ILOGB0  */
+	ret
+END (__ieee754_ilogbl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log.S b/REORG.TODO/sysdeps/i386/fpu/e_log.S
new file mode 100644
index 0000000000..335df22577
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log.S
@@ -0,0 +1,92 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_log)		// log(x): double x at 4(%esp), result left in st(0)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+	fxam
+	fnstsw
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	sahf			// fxam bits: C0 -> CF, C2 -> PF
+	jc	3f		// in case x is NaN or +-Inf
+4:	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah	// keep C3, C2, C0 of the comparison
+	jz	2f		// all clear: |x-1| > limit, use fyl2x on x
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone: x-1 is zero
+	jne	5f
+	fabs			// log(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	jp	4b		// in case x is +-Inf
+	fstp	%st(1)		// NaN: drop the duplicate of x
+	fstp	%st(1)		// drop log(2), leaving x as the result
+	ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)		// Same computation as __ieee754_log, without the fxam test for NaN or +-Inf
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah	// keep C3, C2, C0 of the comparison
+	jz	2b		// all clear: |x-1| > limit; reuses the fyl2x tail at label 2 in __ieee754_log above
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone: x-1 is zero
+	jne	6f
+	fabs			// log(1) is +0 in all rounding modes.
+6:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__log_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10.S b/REORG.TODO/sysdeps/i386/fpu/e_log10.S
new file mode 100644
index 0000000000..17277084ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10.S
@@ -0,0 +1,68 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0		// subtracted from x to form x-1
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29		// |x-1| above this takes the fyl2x path
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)	/* PIC: symbol as a GOTOFF displacement from %edx */
+#else
+# define MO(op) op			/* non-PIC: plain symbol reference */
+#endif
+
+	.text
+ENTRY(__ieee754_log10)		// base-10 log of the double at 4(%esp); result left in %st
+	fldlg2			// log10(2)
+	fldl	4(%esp)		// x : log10(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fxam			// classify x into condition bits C3/C2/C0
+	fnstsw			// status word -> %ax (condition bits land in %ah)
+	fld	%st		// x : x : log10(2)
+	sahf			// %ah -> EFLAGS, so C0 -> CF and C2 -> PF
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsubl	MO(one)		// x-1 : x : log10(2)
+	fld	%st		// x-1 : x-1 : x : log10(2)
+	fabs			// |x-1| : x-1 : x : log10(2)
+	fcompl	MO(limit)	// x-1 : x : log10(2)
+	fnstsw			// x-1 : x : log10(2)
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2f		// all clear: |x-1| > limit, use plain fyl2x
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	5f
+	fabs			// log10(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log10(2)
+	fyl2xp1			// log10(x)
+	ret
+
+2:	fstp	%st(0)		// x : log10(2)
+	fyl2x			// log10(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// NaN : log10(2)
+	fstp	%st(1)		// NaN, returned unchanged
+	ret
+END (__ieee754_log10)
+strong_alias (__ieee754_log10, __log10_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10f.S b/REORG.TODO/sysdeps/i386/fpu/e_log10f.S
new file mode 100644
index 0000000000..72a3b88251
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10f.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0		// subtracted from x to form x-1
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29		// |x-1| above this takes the fyl2x path
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)	/* PIC: symbol as a GOTOFF displacement from %edx */
+#else
+# define MO(op) op			/* non-PIC: plain symbol reference */
+#endif
+
+	.text
+ENTRY(__ieee754_log10f)		// base-10 log of the float at 4(%esp); result left in %st
+	fldlg2			// log10(2)
+	flds	4(%esp)		// x : log10(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fxam			// classify x into condition bits C3/C2/C0
+	fnstsw			// status word -> %ax (condition bits land in %ah)
+	fld	%st		// x : x : log10(2)
+	sahf			// %ah -> EFLAGS, so C0 -> CF and C2 -> PF
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsubl	MO(one)		// x-1 : x : log10(2)
+	fld	%st		// x-1 : x-1 : x : log10(2)
+	fabs			// |x-1| : x-1 : x : log10(2)
+	fcompl	MO(limit)	// x-1 : x : log10(2)
+	fnstsw			// x-1 : x : log10(2)
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2f		// all clear: |x-1| > limit, use plain fyl2x
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	5f
+	fabs			// log10(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log10(2)
+	fyl2xp1			// log10(x)
+	ret
+
+2:	fstp	%st(0)		// x : log10(2)
+	fyl2x			// log10(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// NaN : log10(2)
+	fstp	%st(1)		// NaN, returned unchanged
+	ret
+END (__ieee754_log10f)
+strong_alias (__ieee754_log10f, __log10f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10l.S b/REORG.TODO/sysdeps/i386/fpu/e_log10l.S
new file mode 100644
index 0000000000..9326b19796
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10l.S
@@ -0,0 +1,71 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0		// subtracted from x to form x-1
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29		// |x-1| above this takes the fyl2x path
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)	/* PIC: symbol as a GOTOFF displacement from %edx */
+#else
+# define MO(op) op			/* non-PIC: plain symbol reference */
+#endif
+
+	.text
+ENTRY(__ieee754_log10l)		// base-10 log of the long double at 4(%esp); result left in %st
+	fldlg2			// log10(2)
+	fldt	4(%esp)		// x : log10(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fxam			// classify x into condition bits C3/C2/C0
+	fnstsw			// status word -> %ax (condition bits land in %ah)
+	fld	%st		// x : x : log10(2)
+	sahf			// %ah -> EFLAGS, so C0 -> CF and C2 -> PF
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsubl	MO(one)		// x-1 : x : log10(2)
+	fld	%st		// x-1 : x-1 : x : log10(2)
+	fabs			// |x-1| : x-1 : x : log10(2)
+	fcompl	MO(limit)	// x-1 : x : log10(2)
+	fnstsw			// x-1 : x : log10(2)
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2f		// all clear: |x-1| > limit, use plain fyl2x
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	5f
+	fabs			// log10(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log10(2)
+	fyl2xp1			// log10(x)
+	ret
+
+2:	fstp	%st(0)		// x : log10(2)
+	fyl2x			// log10(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// NaN : log10(2)
+	fstp	%st(1)		// NaN
+	fadd	%st(0)		// NaN + NaN; presumably quiets a signaling NaN - TODO confirm
+	ret
+END(__ieee754_log10l)
+strong_alias (__ieee754_log10l, __log10l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2.S b/REORG.TODO/sysdeps/i386/fpu/e_log2.S
new file mode 100644
index 0000000000..73ff0fffd3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0		// multiplier for fyl2x/fyl2xp1 and the 1 in x-1
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29		// |x-1| above this takes the fyl2x path
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)	/* PIC: symbol as a GOTOFF displacement from %edx */
+#else
+# define MO(op) op			/* non-PIC: plain symbol reference */
+#endif
+
+	.text
+ENTRY(__ieee754_log2)		// base-2 log of the double at 4(%esp); result left in %st
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fldl	MO(one)		// 1
+	fldl	4(%esp)		// x : 1
+	fxam			// classify x into condition bits C3/C2/C0
+	fnstsw			// status word -> %ax (condition bits land in %ah)
+	fld	%st		// x : x : 1
+	sahf			// %ah -> EFLAGS, so C0 -> CF and C2 -> PF
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsub	%st(2), %st	// x-1 : x : 1
+	fld	%st		// x-1 : x-1 : x : 1
+	fabs			// |x-1| : x-1 : x : 1
+	fcompl	MO(limit)	// x-1 : x : 1
+	fnstsw			// x-1 : x : 1
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2f		// all clear: |x-1| > limit, use plain fyl2x
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	5f
+	fabs			// log2(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : 1
+	fyl2xp1			// log2(x)
+	ret
+
+2:	fstp	%st(0)		// x : 1
+	fyl2x			// log2(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// NaN : 1
+	fstp	%st(1)		// NaN, returned unchanged
+	ret
+END (__ieee754_log2)
+strong_alias (__ieee754_log2, __log2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2f.S b/REORG.TODO/sysdeps/i386/fpu/e_log2f.S
new file mode 100644
index 0000000000..344eeb495e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2f.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0		// multiplier for fyl2x/fyl2xp1 and the 1 in x-1
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29		// |x-1| above this takes the fyl2x path
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)	/* PIC: symbol as a GOTOFF displacement from %edx */
+#else
+# define MO(op) op			/* non-PIC: plain symbol reference */
+#endif
+
+	.text
+ENTRY(__ieee754_log2f)		// base-2 log of the float at 4(%esp); result left in %st
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fldl	MO(one)		// 1
+	flds	4(%esp)		// x : 1
+	fxam			// classify x into condition bits C3/C2/C0
+	fnstsw			// status word -> %ax (condition bits land in %ah)
+	fld	%st		// x : x : 1
+	sahf			// %ah -> EFLAGS, so C0 -> CF and C2 -> PF
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsub	%st(2), %st	// x-1 : x : 1
+	fld	%st		// x-1 : x-1 : x : 1
+	fabs			// |x-1| : x-1 : x : 1
+	fcompl	MO(limit)	// x-1 : x : 1
+	fnstsw			// x-1 : x : 1
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2f		// all clear: |x-1| > limit, use plain fyl2x
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	5f
+	fabs			// log2(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : 1
+	fyl2xp1			// log2(x)
+	ret
+
+2:	fstp	%st(0)		// x : 1
+	fyl2x			// log2(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// NaN : 1
+	fstp	%st(1)		// NaN, returned unchanged
+	ret
+END (__ieee754_log2f)
+strong_alias (__ieee754_log2f, __log2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2l.S b/REORG.TODO/sysdeps/i386/fpu/e_log2l.S
new file mode 100644
index 0000000000..73e62ea908
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2l.S
@@ -0,0 +1,70 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0		// multiplier for fyl2x/fyl2xp1 and the 1 in x-1
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29		// |x-1| above this takes the fyl2x path
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)	/* PIC: symbol as a GOTOFF displacement from %edx */
+#else
+# define MO(op) op			/* non-PIC: plain symbol reference */
+#endif
+
+	.text
+ENTRY(__ieee754_log2l)		// base-2 log of the long double at 4(%esp); result left in %st
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fldl	MO(one)		// 1
+	fldt	4(%esp)		// x : 1
+	fxam			// classify x into condition bits C3/C2/C0
+	fnstsw			// status word -> %ax (condition bits land in %ah)
+	fld	%st		// x : x : 1
+	sahf			// %ah -> EFLAGS, so C0 -> CF and C2 -> PF
+	jc	3f		// in case x is NaN or ±Inf
+4:	fsub	%st(2), %st	// x-1 : x : 1
+	fld	%st		// x-1 : x-1 : x : 1
+	fabs			// |x-1| : x-1 : x : 1
+	fcompl	MO(limit)	// x-1 : x : 1
+	fnstsw			// x-1 : x : 1
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2f		// all clear: |x-1| > limit, use plain fyl2x
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	5f
+	fabs			// log2(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : 1
+	fyl2xp1			// log2(x)
+	ret
+
+2:	fstp	%st(0)		// x : 1
+	fyl2x			// log2(x)
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// NaN : 1
+	fstp	%st(1)		// NaN
+	fadd	%st(0)		// NaN + NaN; presumably quiets a signaling NaN - TODO confirm
+	ret
+END (__ieee754_log2l)
+strong_alias (__ieee754_log2l, __log2l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/fpu/e_logf.S
new file mode 100644
index 0000000000..de967a31f5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_logf.S
@@ -0,0 +1,93 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0		// subtracted from x to form x-1
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29		// |x-1| above this takes the fyl2x path
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)	/* PIC: symbol as a GOTOFF displacement from %edx */
+#else
+# define MO(op) op			/* non-PIC: plain symbol reference */
+#endif
+
+	.text
+ENTRY(__ieee754_logf)		// natural log of the float at 4(%esp); result left in %st
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+	fxam			// classify x into condition bits C3/C2/C0
+	fnstsw			// status word -> %ax (condition bits land in %ah)
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fld	%st		// x : x : log(2)
+	sahf			// %ah -> EFLAGS, so C0 -> CF and C2 -> PF
+	jc	3f		// in case x is NaN or +-Inf
+4:	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2f		// all clear: |x-1| > limit, use plain fyl2x
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	5f
+	fabs			// log(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	jp	4b		// in case x is +-Inf
+	fstp	%st(1)		// NaN : log(2)
+	fstp	%st(1)		// NaN, returned unchanged
+	ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)		// natural log of the float at 4(%esp), with no NaN/Inf pre-check
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2b		// |x-1| > limit: reuse the fyl2x tail at the preceding label 2
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	6f
+	fabs			// log(1) is +0 in all rounding modes.
+6:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/fpu/e_logl.S
new file mode 100644
index 0000000000..53127d704e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_logl.S
@@ -0,0 +1,97 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0		// subtracted from x to form x-1
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29		// |x-1| above this takes the fyl2x path
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)	/* PIC: symbol as a GOTOFF displacement from %edx */
+#else
+# define MO(op) op			/* non-PIC: plain symbol reference */
+#endif
+
+	.text
+ENTRY(__ieee754_logl)		// natural log of the long double at 4(%esp); result left in %st
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+	fxam			// classify x into condition bits C3/C2/C0
+	fnstsw			// status word -> %ax (condition bits land in %ah)
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fld	%st		// x : x : log(2)
+	sahf			// %ah -> EFLAGS, so C0 -> CF and C2 -> PF
+	jc	3f		// in case x is NaN or +-Inf
+	movzwl	4+8(%esp), %eax	// top 16 bits of x: sign and exponent field
+	cmpl	$0xc000, %eax	// sign bit set and exponent field >= 0x4000 ?
+	jae	6f		// x <= -2, avoid overflow from -LDBL_MAX - 1.
+4:	fsubl	MO(one)		// x-1 : x : log(2)
+6:	fld	%st		// x-1 : x-1 : x : log(2)  (x itself when entered at 6)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2f		// all clear: |x-1| > limit, use plain fyl2x
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	5f
+	fabs			// log(1) is +0 in all rounding modes.
+5:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	jp	4b		// in case x is +-Inf
+	fstp	%st(1)		// NaN : log(2)
+	fstp	%st(1)		// NaN
+	fadd	%st(0)		// NaN + NaN; presumably quiets a signaling NaN - TODO confirm
+	ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)		// natural log of the long double at 4(%esp), with no NaN/Inf pre-check
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)	// %edx = base register used by MO() operands
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah	// keep only C3, C2 and C0 of the compare
+	jz	2b		// |x-1| > limit: reuse the fyl2x tail at the preceding label 2
+	fxam			// classify x-1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah	// C3 alone set means x-1 is a zero
+	jne	7f
+	fabs			// log(1) is +0 in all rounding modes.
+7:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_pow.S b/REORG.TODO/sysdeps/i386/fpu/e_pow.S
new file mode 100644
index 0000000000..2edb9a9fbc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_pow.S
@@ -0,0 +1,456 @@
+/* ix87 specific implementation of pow function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29		// |x-1| at or below this uses fyl2xp1 instead of fyl2x
+	ASM_SIZE_DIRECTIVE(limit)
+	.type p63,@object
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43	// 2^63 as a little-endian double
+	ASM_SIZE_DIRECTIVE(p63)
+	.type p10,@object
+p10:	.byte 0, 0, 0, 0, 0, 0, 0x90, 0x40	// 2^10 as a little-endian double
+	ASM_SIZE_DIRECTIVE(p10)
+
+	.section .rodata.cst16,"aM",@progbits,16	// mergeable 16-byte entries: pairs of doubles
+
+	.p2align 3
+	.type infinity,@object
+inf_zero:			// pair base: +inf at offset 0, +0.0 at offset 8
+infinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f	// +inf
+	ASM_SIZE_DIRECTIVE(infinity)
+	.type zero,@object
+zero:	.double 0.0
+	ASM_SIZE_DIRECTIVE(zero)
+	.type minf_mzero,@object
+minf_mzero:			// pair base: -inf at offset 0, -0.0 at offset 8
+minfinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff	// -inf
+mzero:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80		// -0.0
+	ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_DBL_MIN			// macro not defined in this file; presumably from i386-math-asm.h
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)		/* PIC: symbol as a GOTOFF displacement from %ecx */
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)	/* same, plus index register x scaled by f */
+#else
+# define MO(op) op				/* non-PIC: plain symbol reference */
+# define MOX(op,x,f) op(,x,f)			/* non-PIC: symbol plus x scaled by f */
+#endif
+
+	.text
+ENTRY(__ieee754_pow)		// pow(x, y) on doubles: x at 4(%esp), y at 12(%esp), result in %st
+	fldl	12(%esp)	// y
+	fxam			// classify y
+
+#ifdef	PIC
+	LOAD_PIC_REG (cx)	// %ecx = base register used by MO()/MOX() operands
+#endif
+
+	fnstsw			// fxam result -> %ah
+	movb	%ah, %dl	// keep the fxam bits of y in %dl (bit 1 is the sign)
+	andb	$0x45, %ah	// isolate the class bits C3, C2, C0
+	cmpb	$0x40, %ah	// is y == 0 ?
+	je	11f
+
+	cmpb	$0x05, %ah	// is y == ±inf ?
+	je	12f
+
+	cmpb	$0x01, %ah	// is y == NaN ?
+	je	30f
+
+	fldl	4(%esp)		// x : y
+
+	subl	$8,%esp		// 8 bytes of scratch for the fistpll/fildll round trip
+	cfi_adjust_cfa_offset (8)
+
+	fxam			// classify x
+	fnstsw
+	movb	%ah, %dh	// keep the fxam bits of x in %dh (bit 1 is the sign)
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	je	20f		// x is ±0
+
+	cmpb	$0x05, %ah
+	je	15f		// x is ±inf
+
+	cmpb	$0x01, %ah
+	je	32f		// x is NaN
+
+	fxch			// y : x
+
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f		// CF clear: |y| >= 2^63
+
+	/* First see whether `y' is an integer.  In this case we
+	   can use a more precise algorithm.  */
+	fld	%st		// y : y : x
+	fistpll	(%esp)		// y : x
+	fildll	(%esp)		// int(y) : y : x
+	fucomp	%st(1)		// y : x
+	fnstsw
+	sahf
+	jne	3f		// int(y) != y: y is not an integer
+
+	/* OK, we have an integer value for y.  If large enough that
+	   errors may propagate out of the 11 bits excess precision, use
+	   the algorithm for real exponent instead.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p10)		// y : x
+	fnstsw
+	sahf
+	jnc	2f		// CF clear: |y| >= 2^10
+	popl	%eax		// low 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx		// high 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	orl	$0, %edx	// set SF from the high word
+	fstp	%st(0)		// x
+	jns	4f		// y >= 0, jump
+	fdivrl	MO(one)		// 1/x		(now referred to as x)
+	negl	%eax		// 64-bit negate of %edx:%eax, low word first
+	adcl	$0, %edx
+	negl	%edx		// %edx:%eax = |y|
+4:	fldl	MO(one)		// 1 : x
+	fxch			// x : 1
+
+	/* If y is even, take the absolute value of x.  Otherwise,
+	   ensure all intermediate values that might overflow have the
+	   sign of x.  */
+	testb	$1, %al
+	jnz	6f
+	fabs
+
+6:	shrdl	$1, %edx, %eax	// shift exponent right by one; old low bit -> CF
+	jnc	5f		// bit clear: no multiply this round
+	fxch
+	fabs
+	fmul	%st(1)		// x : ST*x
+	fxch
+5:	fld	%st		// x : x : ST*x
+	fabs			// |x| : x : ST*x
+	fmulp			// |x|*x : ST*x
+	shrl	$1, %edx	// complete the 64-bit shift in the high word
+	movl	%eax, %ecx	// %ecx used as scratch here
+	orl	%edx, %ecx
+	jnz	6b		// loop while exponent bits remain
+	fstp	%st(0)		// ST*x
+#ifdef	PIC
+	LOAD_PIC_REG (cx)	// %ecx was overwritten in the loop; presumably needed by the macro below
+#endif
+	DBL_NARROW_EVAL_UFLOW_NONNAN
+	ret
+
+	/* y is ±NAN */
+30:	fldl	4(%esp)		// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fucomp	%st(1)		// x : y
+	fnstsw
+	sahf
+	je	31f		// x == 1 (or unordered): return x
+	fxch			// y : x
+31:	fstp	%st(1)		// keep only the value on top of the stack
+	ret
+
+	cfi_adjust_cfa_offset (8)
+32:	addl	$8, %esp	// x is NaN: release the scratch slot
+	cfi_adjust_cfa_offset (-8)
+	fstp	%st(1)		// x (NaN) is the result
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+2:	// y is a large integer (absolute value at least 1L<<10), but
+	// may be odd unless at least 1L<<64.  So it may be necessary
+	// to adjust the sign of a negative result afterwards.
+	fxch			// x : y
+	fabs			// |x| : y
+	fxch			// y : x
+	.align ALIGNARG(4)
+3:	/* y is a real number.  */
+	fxch			// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fldl	MO(limit)	// 0.29 : 1.0 : x : y
+	fld	%st(2)		// x : 0.29 : 1.0 : x : y
+	fsub	%st(2)		// x-1 : 0.29 : 1.0 : x : y
+	fabs			// |x-1| : 0.29 : 1.0 : x : y
+	fucompp			// 1.0 : x : y
+	fnstsw
+	fxch			// x : 1.0 : y
+	sahf
+	ja	7f		// |x-1| > 0.29: use fyl2x
+	fsub	%st(1)		// x-1 : 1.0 : y
+	fyl2xp1			// log2(x) : y
+	jmp	8f
+
+7:	fyl2x			// log2(x) : y
+8:	fmul	%st(1)		// y*log2(x) : y
+	fst	%st(1)		// y*log2(x) : y*log2(x)
+	frndint			// int(y*log2(x)) : y*log2(x)
+	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
+	fxch			// fract(y*log2(x)) : int(y*log2(x))
+	f2xm1			// 2^fract(y*log2(x))-1 : int(y*log2(x))
+	faddl	MO(one)		// 2^fract(y*log2(x)) : int(y*log2(x))
+
+	// Before scaling, we must negate if x is negative and y is an
+	// odd integer.
+	testb	$2, %dh		// sign bit of x as saved from fxam
+	jz	291f		// x is not negative: no sign fixup
+	// x is negative.  If y is an odd integer, negate the result.
+	fldl	20(%esp)	// y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fld	%st		// y : y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fabs			// |y| : y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fcompl	MO(p63)		// y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fnstsw
+	sahf
+	jnc	290f		// |y| >= 2^63: skip the parity test
+
+	// We must find out whether y is an odd integer.
+	fld	%st		// y : y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fistpll	(%esp)		// y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fildll	(%esp)		// int(y) : y : 2^fract(y*log2(x)) : int(y*log2(x))
+	fucompp			// 2^fract(y*log2(x)) : int(y*log2(x))
+	fnstsw
+	sahf
+	jne	291f		// y is not an integer
+
+	// OK, the value is an integer, but is it odd?
+	popl	%eax		// low 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx		// high 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	292f		// jump if not odd
+	// It's an odd integer.
+	fchs
+	jmp	292f
+
+	cfi_adjust_cfa_offset (8)
+290:	fstp	%st(0)		// 2^fract(y*log2(x)) : int(y*log2(x))
+291:	addl	$8, %esp	// release the scratch slot
+	cfi_adjust_cfa_offset (-8)
+292:	fscale			// +/- 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
+	fstp	%st(1)		// +/- 2^fract(y*log2(x))*2^int(y*log2(x))
+	DBL_NARROW_EVAL_UFLOW_NONNAN
+	ret
+
+
+	// pow(x,±0) = 1
+	.align ALIGNARG(4)
+11:	fstp	%st(0)		// pop y
+	fldl	MO(one)
+	ret
+
+	// y == ±inf
+	.align ALIGNARG(4)
+12:	fstp	%st(0)		// pop y
+	fldl	MO(one)		// 1
+	fldl	4(%esp)		// x : 1
+	fabs			// abs(x) : 1
+	fucompp			// < 1, == 1, or > 1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x45, %ah
+	je	13f		// jump if x is NaN
+
+	cmpb	$0x40, %ah
+	je	14f		// jump if |x| == 1
+
+	shlb	$1, %ah		// move the |x| < 1 flag (C0) to bit 1
+	xorb	%ah, %dl	// combine with the sign bit of y (bit 1 of %dl)
+	andl	$2, %edx	// %edx = 0 or 2
+	fldl	MOX(inf_zero, %edx, 4)	// offset 0 loads +inf, offset 8 loads +0
+	ret
+
+	.align ALIGNARG(4)
+14:	fldl	MO(one)
+	ret
+
+	.align ALIGNARG(4)
+13:	fldl	4(%esp)		// load x == NaN
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±inf
+15:	fstp	%st(0)		// y
+	testb	$2, %dh
+	jz	16f		// jump if x == +inf
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p63)		// y
+	fnstsw
+	sahf
+	jnc	16f		// |y| >= 2^63
+
+	// We must find out whether y is an odd integer.
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	17f		// y is not an integer
+
+	// OK, the value is an integer.
+	popl	%eax		// low 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx		// high 32 bits of int(y); bit 31 is the sign of y
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	18f		// jump if not odd
+	// It's an odd integer.
+	shrl	$31, %edx	// %edx = 1 if y < 0, else 0
+	fldl	MOX(minf_mzero, %edx, 8)	// -inf for y > 0, -0 for y < 0
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+16:	fcompl	MO(zero)	// compare y with 0 and pop it
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fnstsw
+	shrl	$5, %eax	// C0 (status word bit 8) -> bit 3
+	andl	$8, %eax	// 8 if y < 0, else 0
+	fldl	MOX(inf_zero, %eax, 1)	// +inf for y > 0, +0 for y < 0
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+17:	shll	$30, %edx	// sign bit for y in right position
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+18:	shrl	$31, %edx	// %edx = 1 if y < 0, else 0
+	fldl	MOX(inf_zero, %edx, 8)	// +inf for y > 0, +0 for y < 0
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±0
+20:	fstp	%st(0)		// y
+	testb	$2, %dl
+	jz	21f		// y > 0
+
+	// x is ±0 and y is < 0.  We must find out whether y is an odd integer.
+	testb	$2, %dh
+	jz	25f		// x is +0: parity of y does not matter
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p63)		// y
+	fnstsw
+	sahf
+	jnc	25f		// |y| >= 2^63
+
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	26f		// y is not an integer
+
+	// OK, the value is an integer.
+	popl	%eax		// low 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx		// high 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	27f		// jump if not odd
+	// It's an odd integer.
+	// Raise divide-by-zero exception and get minus infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	fchs
+	ret
+
+	cfi_adjust_cfa_offset (8)
+25:	fstp	%st(0)		// pop y
+26:	addl	$8, %esp	// release the scratch slot
+	cfi_adjust_cfa_offset (-8)
+27:	// Raise divide-by-zero exception and get infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±0 and y is > 0.  We must find out whether y is an odd integer.
+21:	testb	$2, %dh
+	jz	22f		// x is +0: result is +0
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fcoml	MO(p63)		// y
+	fnstsw
+	sahf
+	jnc	22f		// y >= 2^63
+
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	23f		// y is not an integer
+
+	// OK, the value is an integer.
+	popl	%eax		// low 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx		// high 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	24f		// jump if not odd
+	// It's an odd integer.
+	fldl	MO(mzero)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+22:	fstp	%st(0)		// pop y
+23:	addl	$8, %esp	// Don't use 2 x pop
+	cfi_adjust_cfa_offset (-8)
+24:	fldl	MO(zero)
+	ret
+
+END(__ieee754_pow)
+strong_alias (__ieee754_pow, __pow_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_powf.S b/REORG.TODO/sysdeps/i386/fpu/e_powf.S
new file mode 100644
index 0000000000..467ef2380b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_powf.S
@@ -0,0 +1,392 @@
+/* ix87 specific implementation of powf function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8	// allocatable, mergeable, 8-byte entries
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29		// |x-1| at or below this uses fyl2xp1 instead of fyl2x
+	ASM_SIZE_DIRECTIVE(limit)
+	.type p31,@object
+p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41	// 2^31 as a little-endian double
+	ASM_SIZE_DIRECTIVE(p31)
+
+	.section .rodata.cst16,"aM",@progbits,16	// mergeable 16-byte entries: pairs of doubles
+
+	.p2align 3
+	.type infinity,@object
+inf_zero:			// pair base: +inf at offset 0, +0.0 at offset 8
+infinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f	// +inf
+	ASM_SIZE_DIRECTIVE(infinity)
+	.type zero,@object
+zero:	.double 0.0
+	ASM_SIZE_DIRECTIVE(zero)
+	.type minf_mzero,@object
+minf_mzero:			// pair base: -inf at offset 0, -0.0 at offset 8
+minfinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff	// -inf
+mzero:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80		// -0.0
+	ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_FLT_MIN			// macro not defined in this file; presumably from i386-math-asm.h
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)		/* PIC: symbol as a GOTOFF displacement from %ecx */
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)	/* same, plus index register x scaled by f */
+#else
+# define MO(op) op				/* non-PIC: plain symbol reference */
+# define MOX(op,x,f) op(,x,f)			/* non-PIC: symbol plus x scaled by f */
+#endif
+
+	.text
+ENTRY(__ieee754_powf)		// powf(x, y) on floats: x at 4(%esp), y at 8(%esp), result in %st
+	flds	8(%esp)	// y
+	fxam			// classify y
+
+#ifdef	PIC
+	LOAD_PIC_REG (cx)	// %ecx = base register used by MO()/MOX() operands
+#endif
+
+	fnstsw			// fxam result -> %ah
+	movb	%ah, %dl	// keep the fxam bits of y in %dl (bit 1 is the sign)
+	andb	$0x45, %ah	// isolate the class bits C3, C2, C0
+	cmpb	$0x40, %ah	// is y == 0 ?
+	je	11f
+
+	cmpb	$0x05, %ah	// is y == ±inf ?
+	je	12f
+
+	cmpb	$0x01, %ah	// is y == NaN ?
+	je	30f
+
+	flds	4(%esp)		// x : y
+
+	subl	$4, %esp	// 4 bytes of scratch for the fistpl/fildl round trip
+	cfi_adjust_cfa_offset (4)
+
+	fxam			// classify x
+	fnstsw
+	movb	%ah, %dh	// keep the fxam bits of x in %dh (bit 1 is the sign)
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	je	20f		// x is ±0
+
+	cmpb	$0x05, %ah
+	je	15f		// x is ±inf
+
+	cmpb	$0x01, %ah
+	je	33f		// x is NaN
+
+	fxch			// y : x
+
+	/* fistpl raises invalid exception for |y| >= 1L<<31.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p31)		// y : x
+	fnstsw
+	sahf
+	jnc	2f		// CF clear: |y| >= 2^31
+
+	/* First see whether `y' is an integer.  In this case we
+	   can use a more precise algorithm.  */
+	fld	%st		// y : y : x
+	fistpl	(%esp)		// y : x
+	fildl	(%esp)		// int(y) : y : x
+	fucomp	%st(1)		// y : x
+	fnstsw
+	sahf
+	jne	3f		// int(y) != y: y is not an integer
+
+	/* OK, we have an integer value for y.  */
+	popl	%edx		// %edx = int(y)
+	cfi_adjust_cfa_offset (-4)
+	orl	$0, %edx	// set SF from int(y)
+	fstp	%st(0)		// x
+	jns	4f		// y >= 0, jump
+	fdivrl	MO(one)		// 1/x		(now referred to as x)
+	negl	%edx		// %edx = |y|
+4:	fldl	MO(one)		// 1 : x
+	fxch			// x : 1
+
+	/* If y is even, take the absolute value of x.  Otherwise,
+	   ensure all intermediate values that might overflow have the
+	   sign of x.  */
+	testb	$1, %dl
+	jnz	6f
+	fabs
+
+6:	shrl	$1, %edx	// shift exponent right by one; old low bit -> CF
+	jnc	5f		// bit clear: no multiply this round
+	fxch
+	fabs
+	fmul	%st(1)		// x : ST*x
+	fxch
+5:	fld	%st		// x : x : ST*x
+	fabs			// |x| : x : ST*x
+	fmulp			// |x|*x : ST*x
+	testl	%edx, %edx
+	jnz	6b		// loop while exponent bits remain
+	fstp	%st(0)		// ST*x
+	FLT_NARROW_EVAL_UFLOW_NONNAN
+	ret
+
+	/* y is ±NAN */
+30:	flds	4(%esp)		// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fucomp	%st(1)		// x : y
+	fnstsw
+	sahf
+	je	31f		// x == 1 (or unordered): return x
+	fxch			// y : x
+31:	fstp	%st(1)		// keep only the value on top of the stack
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+2:	/* y is a large integer (so even).  */
+	fxch			// x : y
+	fabs			// |x| : y
+	fxch			// y : x
+	.align ALIGNARG(4)
+3:	/* y is a real number.  */
+	fxch			// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fldl	MO(limit)	// 0.29 : 1.0 : x : y
+	fld	%st(2)		// x : 0.29 : 1.0 : x : y
+	fsub	%st(2)		// x-1 : 0.29 : 1.0 : x : y
+	fabs			// |x-1| : 0.29 : 1.0 : x : y
+	fucompp			// 1.0 : x : y
+	fnstsw
+	fxch			// x : 1.0 : y
+	sahf
+	ja	7f		// |x-1| > 0.29: use fyl2x
+	fsub	%st(1)		// x-1 : 1.0 : y
+	fyl2xp1			// log2(x) : y
+	jmp	8f
+
+7:	fyl2x			// log2(x) : y
+8:	fmul	%st(1)		// y*log2(x) : y
+	fst	%st(1)		// y*log2(x) : y*log2(x)
+	frndint			// int(y*log2(x)) : y*log2(x)
+	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
+	fxch			// fract(y*log2(x)) : int(y*log2(x))
+	f2xm1			// 2^fract(y*log2(x))-1 : int(y*log2(x))
+	faddl	MO(one)		// 2^fract(y*log2(x)) : int(y*log2(x))
+	fscale			// 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
+32:	addl	$4, %esp	// release the scratch slot
+	cfi_adjust_cfa_offset (-4)
+	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
+	FLT_NARROW_EVAL_UFLOW_NONNAN
+	ret
+
+	/* x is NaN.  */
+	cfi_adjust_cfa_offset (4)
+33:	addl	$4, %esp	// release the scratch slot
+	cfi_adjust_cfa_offset (-4)
+	fstp	%st(1)		// x (NaN) is the result
+	ret
+
+	// pow(x,±0) = 1
+	.align ALIGNARG(4)
+11:	fstp	%st(0)		// pop y
+	fldl	MO(one)
+	ret
+
+	// y == ±inf
+	.align ALIGNARG(4)
+12:	fstp	%st(0)		// pop y
+	fldl	MO(one)		// 1
+	flds	4(%esp)		// x : 1
+	fabs			// abs(x) : 1
+	fucompp			// < 1, == 1, or > 1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x45, %ah
+	je	13f		// jump if x is NaN
+
+	cmpb	$0x40, %ah
+	je	14f		// jump if |x| == 1
+
+	shlb	$1, %ah		// move the |x| < 1 flag (C0) to bit 1
+	xorb	%ah, %dl	// combine with the sign bit of y (bit 1 of %dl)
+	andl	$2, %edx	// %edx = 0 or 2
+	fldl	MOX(inf_zero, %edx, 4)	// offset 0 loads +inf, offset 8 loads +0
+	ret
+
+	.align ALIGNARG(4)
+14:	fldl	MO(one)
+	ret
+
+	.align ALIGNARG(4)
+13:	flds	4(%esp)		// load x == NaN
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+	// x is ±inf
+15:	fstp	%st(0)		// y
+	testb	$2, %dh
+	jz	16f		// jump if x == +inf
+
+	// fistpl raises invalid exception for |y| >= 1L<<31, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p31)		// y
+	fnstsw
+	sahf
+	jnc	16f		// |y| >= 2^31
+
+	// We must find out whether y is an odd integer.
+	fld	%st		// y : y
+	fistpl	(%esp)		// y
+	fildl	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	17f		// y is not an integer
+
+	// OK, the value is an integer.
+	popl	%edx		// %edx = int(y); bit 31 is the sign of y
+	cfi_adjust_cfa_offset (-4)
+	testb	$1, %dl
+	jz	18f		// jump if not odd
+	// It's an odd integer.
+	shrl	$31, %edx	// %edx = 1 if y < 0, else 0
+	fldl	MOX(minf_mzero, %edx, 8)	// -inf for y > 0, -0 for y < 0
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+16:	fcompl	MO(zero)	// compare y with 0 and pop it
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+	fnstsw
+	shrl	$5, %eax	// C0 (status word bit 8) -> bit 3
+	andl	$8, %eax	// 8 if y < 0, else 0
+	fldl	MOX(inf_zero, %eax, 1)	// +inf for y > 0, +0 for y < 0
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+17:	shll	$30, %edx	// sign bit for y in right position
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+18:	shrl	$31, %edx	// %edx = 1 if y < 0, else 0
+	fldl	MOX(inf_zero, %edx, 8)	// +inf for y > 0, +0 for y < 0
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+	// x is ±0
+20:	fstp	%st(0)		// y
+	testb	$2, %dl
+	jz	21f		// y > 0
+
+	// x is ±0 and y is < 0.  We must find out whether y is an odd integer.
+	testb	$2, %dh
+	jz	25f		// x is +0: parity of y does not matter
+
+	// fistpl raises invalid exception for |y| >= 1L<<31, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p31)		// y
+	fnstsw
+	sahf
+	jnc	25f		// |y| >= 2^31
+
+	fld	%st		// y : y
+	fistpl	(%esp)		// y
+	fildl	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	26f		// y is not an integer
+
+	// OK, the value is an integer.
+	popl	%edx		// %edx = int(y)
+	cfi_adjust_cfa_offset (-4)
+	testb	$1, %dl
+	jz	27f		// jump if not odd
+	// It's an odd integer.
+	// Raise divide-by-zero exception and get minus infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	fchs
+	ret
+
+	cfi_adjust_cfa_offset (4)
+25:	fstp	%st(0)		// pop y
+26:	addl	$4, %esp	// release the scratch slot
+	cfi_adjust_cfa_offset (-4)
+27:	// Raise divide-by-zero exception and get infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	ret
+
+	cfi_adjust_cfa_offset (4)
+	.align ALIGNARG(4)
+	// x is ±0 and y is > 0.  We must find out whether y is an odd integer.
+21:	testb	$2, %dh
+	jz	22f		// x is +0: result is +0
+
+	// fistpl raises invalid exception for |y| >= 1L<<31, so test
+	// that (in which case y is certainly even) before testing
+	// whether y is odd.
+	fcoml	MO(p31)		// y
+	fnstsw
+	sahf
+	jnc	22f		// y >= 2^31
+
+	fld	%st		// y : y
+	fistpl	(%esp)		// y
+	fildl	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	23f		// y is not an integer
+
+	// OK, the value is an integer.
+	popl	%edx		// %edx = int(y)
+	cfi_adjust_cfa_offset (-4)
+	testb	$1, %dl
+	jz	24f		// jump if not odd
+	// It's an odd integer.
+	fldl	MO(mzero)
+	ret
+
+	cfi_adjust_cfa_offset (4)
+22:	fstp	%st(0)		// pop y
+23:	addl	$4, %esp	// Don't use pop.
+	cfi_adjust_cfa_offset (-4)
+24:	fldl	MO(zero)
+	ret
+
+END(__ieee754_powf)
+strong_alias (__ieee754_powf, __powf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_powl.S b/REORG.TODO/sysdeps/i386/fpu/e_powl.S
new file mode 100644
index 0000000000..9e162848e4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_powl.S
@@ -0,0 +1,459 @@
+/* ix87 specific implementation of pow function.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type p2,@object
+p2:	.byte 0, 0, 0, 0, 0, 0, 0x10, 0x40	// double 0x1p2 (4.0), stored as little-endian bytes
+	ASM_SIZE_DIRECTIVE(p2)
+	.type p63,@object
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43	// double 0x1p63
+	ASM_SIZE_DIRECTIVE(p63)
+	.type p64,@object
+p64:	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43	// double 0x1p64
+	ASM_SIZE_DIRECTIVE(p64)
+	.type p78,@object
+p78:	.byte 0, 0, 0, 0, 0, 0, 0xd0, 0x44	// double 0x1p78
+	ASM_SIZE_DIRECTIVE(p78)
+	.type pm79,@object
+pm79:	.byte 0, 0, 0, 0, 0, 0, 0, 0x3b	// double 0x1p-79
+	ASM_SIZE_DIRECTIVE(pm79)
+
+	.section .rodata.cst16,"aM",@progbits,16
+
+	.p2align 3
+	.type infinity,@object
+inf_zero:			// table { +inf, +0.0 }; the code indexes it with byte offset 0 or 8
+infinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
+	ASM_SIZE_DIRECTIVE(infinity)
+	.type zero,@object
+zero:	.double 0.0
+	ASM_SIZE_DIRECTIVE(zero)
+	.type minf_mzero,@object
+minf_mzero:			// table { -inf, -0.0 }; the code indexes it with byte offset 0 or 8
+minfinity:
+	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
+mzero:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_LDBL_MIN			// macro from <i386-math-asm.h>, not visible here; presumably defines an LDBL_MIN constant -- confirm
+/* MO(sym): memory operand for sym.  MOX(sym,x,f): sym indexed by register x scaled by f.  PIC builds go through @GOTOFF off %ecx.  */
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+	.text
+ENTRY(__ieee754_powl)		// long double pow; x is read from 4(%esp), y from 16(%esp)
+	fldt	16(%esp)	// y
+	fxam			// classify y into C3/C2/C1/C0
+
+#ifdef	PIC
+	LOAD_PIC_REG (cx)
+#endif
+
+	fnstsw
+	movb	%ah, %dl	// keep the class bits of y in %dl; bit 1 (C1) is the sign
+	andb	$0x45, %ah	// isolate C3, C2 and C0
+	cmpb	$0x40, %ah	// is y == 0 ?
+	je	11f
+
+	cmpb	$0x05, %ah	// is y == ±inf ?
+	je	12f
+
+	cmpb	$0x01, %ah	// is y == NaN ?
+	je	30f
+
+	fldt	4(%esp)		// x : y
+
+	subl	$8,%esp		// 8 bytes of scratch for fistpll/fildll
+	cfi_adjust_cfa_offset (8)
+
+	fxam
+	fnstsw
+	movb	%ah, %dh	// keep the class bits of x in %dh; bit 1 (C1) is the sign
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	je	20f		// x is ±0
+
+	cmpb	$0x05, %ah
+	je	15f		// x is ±inf
+
+	cmpb	$0x01, %ah
+	je	32f		// x is NaN
+
+	fxch			// y : x
+
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f		// |y| >= 1L<<63
+
+	/* First see whether `y' is a natural number.  In this case we
+	   can use a more precise algorithm.  */
+	fld	%st		// y : y : x
+	fistpll	(%esp)		// y : x
+	fildll	(%esp)		// int(y) : y : x
+	fucomp	%st(1)		// y : x
+	fnstsw
+	sahf
+	je	9f		// int(y) == y: y is an integer
+
+	// If y has absolute value at most 0x1p-79, then any finite
+	// nonzero x will result in 1.  Saturate y to those bounds to
+	// avoid underflow in the calculation of y*log2(x).
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(pm79)	// y : x
+	fnstsw
+	sahf
+	jnc	3f		// |y| >= 0x1p-79: use y unchanged
+	fstp	%st(0)		// pop y
+	fldl	MO(pm79)	// 0x1p-79 : x
+	testb	$2, %dl
+	jnz	3f		// y > 0 (NOTE(review): elsewhere in this routine bit 1 of %dl set means y < 0 -- confirm)
+	fchs			// -0x1p-79 : x
+	jmp	3f
+
+9:	/* OK, we have an integer value for y.  Unless very small
+	   (we use < 4), use the algorithm for real exponent to avoid
+	   accumulation of errors.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p2)		// y : x
+	fnstsw
+	sahf
+	jnc	3f		// |y| >= 4
+	popl	%eax		// low 32 bits of int(y) as stored by fistpll
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx		// high 32 bits of int(y)
+	cfi_adjust_cfa_offset (-4)
+	orl	$0, %edx	// set SF from the high word for the jns below
+	fstp	%st(0)		// x
+	jns	4f		// y >= 0, jump
+	fdivrl	MO(one)		// 1/x		(now referred to as x)
+	negl	%eax		// negate the 64-bit integer held in %edx:%eax
+	adcl	$0, %edx
+	negl	%edx
+4:	fldl	MO(one)		// 1 : x
+	fxch
+
+	/* If y is even, take the absolute value of x.  Otherwise,
+	   ensure all intermediate values that might overflow have the
+	   sign of x.  */
+	testb	$1, %al
+	jnz	6f
+	fabs
+	// Square-and-multiply loop over the bits of the exponent in %edx:%eax.
+6:	shrdl	$1, %edx, %eax	// shift the lowest exponent bit out into CF
+	jnc	5f		// bit clear: no multiply for this bit
+	fxch
+	fabs
+	fmul	%st(1)		// x : ST*x
+	fxch
+5:	fld	%st		// x : x : ST*x
+	fabs			// |x| : x : ST*x
+	fmulp			// |x|*x : ST*x
+	shrl	$1, %edx
+	movl	%eax, %ecx	// %ecx is scratch here, hence the reload below for PIC
+	orl	%edx, %ecx
+	jnz	6b		// loop while exponent bits remain
+	fstp	%st(0)		// ST*x
+#ifdef	PIC
+	LOAD_PIC_REG (cx)
+#endif
+	LDBL_CHECK_FORCE_UFLOW_NONNAN	// macro from <i386-math-asm.h>, not visible here; presumably forces underflow for tiny results -- confirm
+	ret
+
+	/* y is ±NAN */
+30:	fldt	4(%esp)		// x : y
+	fldl	MO(one)		// 1.0 : x : y
+	fucomp	%st(1)		// x : y
+	fnstsw
+	sahf
+	je	33f		// ZF set: x == 1 or unordered
+31:	/* At least one argument NaN, and result should be NaN.  */
+	faddp
+	ret
+33:	jp	31b		// PF set: unordered, so x is NaN as well
+	/* pow (1, NaN); check if the NaN signaling.  */
+	testb	$0x40, 23(%esp)	// bit 6 of the top significand byte of y (quiet bit)
+	jz	31b		// quiet bit clear: let faddp handle the signaling NaN
+	fstp	%st(1)		// drop y, leaving x == 1 as the result
+	ret
+
+	cfi_adjust_cfa_offset (8)
+32:	addl	$8, %esp	// x is NaN: release the scratch area
+	cfi_adjust_cfa_offset (-8)
+	faddp
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+2:	// y is a large integer (absolute value at least 1L<<63).
+	// If y has absolute value at least 1L<<78, then any finite
+	// nonzero x will result in 0 (underflow), 1 or infinity (overflow).
+	// Saturate y to those bounds to avoid overflow in the calculation
+	// of y*log2(x).
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p78)		// y : x
+	fnstsw
+	sahf
+	jc	3f		// |y| < 1L<<78: use y unchanged
+	fstp	%st(0)		// pop y
+	fldl	MO(p78)		// 1L<<78 : x
+	testb	$2, %dl
+	jz	3f		// y > 0
+	fchs			// -(1L<<78) : x
+	.align ALIGNARG(4)
+3:	/* y is a real number.  */
+	subl	$28, %esp	// stack room for the two long double values passed to the helper
+	cfi_adjust_cfa_offset (28)
+	fstpt	12(%esp)	// x	(y stored at 12(%esp))
+	fstpt	(%esp)		// <empty>	(x stored at (%esp))
+	call	HIDDEN_JUMPTARGET (__powl_helper)	// <result>
+	addl	$36, %esp	// 28 bytes reserved above + 8 bytes of scratch
+	cfi_adjust_cfa_offset (-36)
+	ret
+
+	// pow(x,±0) = 1, unless x is sNaN
+	.align ALIGNARG(4)
+11:	fstp	%st(0)		// pop y
+	fldt	4(%esp)		// x
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	112f		// x is NaN
+111:	fstp	%st(0)		// <empty>
+	fldl	MO(one)		// 1
+	ret
+
+112:	testb	$0x40, 11(%esp)	// bit 6 of the top significand byte of x (quiet bit)
+	jnz	111b		// quiet NaN: result is 1
+	fadd	%st(0)		// signaling NaN: x + x
+	ret
+
+	// y == ±inf
+	.align ALIGNARG(4)
+12:	fstp	%st(0)		// pop y
+	fldl	MO(one)		// 1
+	fldt	4(%esp)		// x : 1
+	fabs			// abs(x) : 1
+	fucompp			// < 1, == 1, or > 1
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x45, %ah
+	je	13f		// jump if x is NaN
+
+	cmpb	$0x40, %ah
+	je	14f		// jump if |x| == 1
+
+	shlb	$1, %ah		// C0 (|x| < 1) moved up to bit 1
+	xorb	%ah, %dl	// combine with the sign of y in bit 1 of %dl
+	andl	$2, %edx	// 0 selects +inf, 2 (scaled by 4) selects +0
+	fldl	MOX(inf_zero, %edx, 4)
+	ret
+
+	.align ALIGNARG(4)
+14:	fldl	MO(one)
+	ret
+
+	.align ALIGNARG(4)
+13:	fldt	4(%esp)		// load x == NaN
+	fadd	%st(0)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±inf
+15:	fstp	%st(0)		// y
+	testb	$2, %dh
+	jz	16f		// jump if x == +inf
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, but y
+	// may be odd unless we know |y| >= 1L<<64.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p64)		// y
+	fnstsw
+	sahf
+	jnc	16f		// |y| >= 1L<<64: handled like x == +inf
+	fldl	MO(p63)		// p63 : y
+	fxch			// y : p63
+	fprem			// y%p63 : p63
+	fstp	%st(1)		// y%p63
+
+	// We must find out whether y is an odd integer.
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	17f		// y is not an integer
+
+	// OK, the value is an integer, but is it odd?
+	popl	%eax		// low 32 bits of the integer
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx		// high 32 bits; bit 31 is the sign
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	18f		// jump if not odd
+	// It's an odd integer.
+	shrl	$31, %edx	// 0 selects -inf, 1 selects -0
+	fldl	MOX(minf_mzero, %edx, 8)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+16:	fcompl	MO(zero)	// compare y with 0 and pop it; C0 set if y < 0
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fnstsw
+	shrl	$5, %eax	// move C0 (status bit 8) down to bit 3
+	andl	$8, %eax	// byte offset 0 selects +inf, 8 selects +0
+	fldl	MOX(inf_zero, %eax, 1)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+17:	shll	$30, %edx	// sign bit for y in right position
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+18:	shrl	$31, %edx	// 0 selects +inf, 1 selects +0
+	fldl	MOX(inf_zero, %edx, 8)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±0
+20:	fstp	%st(0)		// y
+	testb	$2, %dl
+	jz	21f		// y > 0
+
+	// x is ±0 and y is < 0.  We must find out whether y is an odd integer.
+	testb	$2, %dh
+	jz	25f		// x is +0: the result is +inf whatever the parity of y
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, but y
+	// may be odd unless we know |y| >= 1L<<64.
+	fld	%st		// y : y
+	fabs			// |y| : y
+	fcompl	MO(p64)		// y
+	fnstsw
+	sahf
+	jnc	25f		// |y| >= 1L<<64
+	fldl	MO(p63)		// p63 : y
+	fxch			// y : p63
+	fprem			// y%p63 : p63
+	fstp	%st(1)		// y%p63
+
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	26f		// y is not an integer
+
+	// OK, the value is an integer, but is it odd?
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	27f		// jump if not odd
+	// It's an odd integer.
+	// Raise divide-by-zero exception and get minus infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	fchs
+	ret
+
+	cfi_adjust_cfa_offset (8)
+25:	fstp	%st(0)		// pop y
+26:	addl	$8, %esp	// release the scratch area
+	cfi_adjust_cfa_offset (-8)
+27:	// Raise divide-by-zero exception and get infinity value.
+	fldl	MO(one)
+	fdivl	MO(zero)
+	ret
+
+	cfi_adjust_cfa_offset (8)
+	.align ALIGNARG(4)
+	// x is ±0 and y is > 0.  We must find out whether y is an odd integer.
+21:	testb	$2, %dh
+	jz	22f		// x is +0: the result is +0 whatever the parity of y
+
+	// fistpll raises invalid exception for |y| >= 1L<<63, but y
+	// may be odd unless we know |y| >= 1L<<64.
+	fld	%st		// y : y
+	fcompl	MO(p64)		// y
+	fnstsw
+	sahf
+	jnc	22f		// y >= 1L<<64
+	fldl	MO(p63)		// p63 : y
+	fxch			// y : p63
+	fprem			// y%p63 : p63
+	fstp	%st(1)		// y%p63
+
+	fld	%st		// y : y
+	fistpll	(%esp)		// y
+	fildll	(%esp)		// int(y) : y
+	fucompp			// <empty>
+	fnstsw
+	sahf
+	jne	23f		// y is not an integer
+
+	// OK, the value is an integer, but is it odd?
+	popl	%eax
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	andb	$1, %al
+	jz	24f		// jump if not odd
+	// It's an odd integer.
+	fldl	MO(mzero)	// x is -0 and y is odd: return -0
+	ret
+
+	cfi_adjust_cfa_offset (8)
+22:	fstp	%st(0)		// pop y
+23:	addl	$8, %esp	// Don't use 2 x pop
+	cfi_adjust_cfa_offset (-8)
+24:	fldl	MO(zero)	// return +0
+	ret
+
+END(__ieee754_powl)
+strong_alias (__ieee754_powl, __powl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c b/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c
new file mode 100644
index 0000000000..1347b0468c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c
@@ -0,0 +1,3 @@
+/* Empty.  This file is only meant to avoid compiling the file with the
+   same name in the libm-ieee754 directory.  The code is not used since
+   there is an assembler version for all users of this file.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainder.S b/REORG.TODO/sysdeps/i386/fpu/e_remainder.S
new file mode 100644
index 0000000000..f7867aa90b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainder.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainder)	/* remainder of the double at 4(%esp) by the double at 12(%esp) */
+	fldl	12(%esp)	/* y */
+	fldl	4(%esp)		/* x : y */
+1:	fprem1			/* partial IEEE remainder of st(0) by st(1) */
+	fstsw	%ax
+	sahf			/* C2 ends up in PF */
+	jp	1b		/* C2 set: reduction incomplete, run fprem1 again */
+	fstp	%st(1)		/* drop y, leaving the remainder in st(0) */
+	ret
+END (__ieee754_remainder)
+strong_alias (__ieee754_remainder, __remainder_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S b/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S
new file mode 100644
index 0000000000..cfd390bc69
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainderf)	/* remainder of the float at 4(%esp) by the float at 8(%esp) */
+	flds	8(%esp)		/* y */
+	flds	4(%esp)		/* x : y */
+1:	fprem1			/* partial IEEE remainder of st(0) by st(1) */
+	fstsw	%ax
+	sahf			/* C2 ends up in PF */
+	jp	1b		/* C2 set: reduction incomplete, run fprem1 again */
+	fstp	%st(1)		/* drop y, leaving the remainder in st(0) */
+	ret
+END (__ieee754_remainderf)
+strong_alias (__ieee754_remainderf, __remainderf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S b/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S
new file mode 100644
index 0000000000..5ec23a37a3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainderl)	/* remainder of the long double at 4(%esp) by the one at 16(%esp) */
+	fldt	16(%esp)	/* y */
+	fldt	4(%esp)		/* x : y */
+1:	fprem1			/* partial IEEE remainder of st(0) by st(1) */
+	fstsw	%ax
+	sahf			/* C2 ends up in PF */
+	jp	1b		/* C2 set: reduction incomplete, run fprem1 again */
+	fstp	%st(1)		/* drop y, leaving the remainder in st(0) */
+	ret
+END (__ieee754_remainderl)
+strong_alias (__ieee754_remainderl, __remainderl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalb.S b/REORG.TODO/sysdeps/i386/fpu/e_scalb.S
new file mode 100644
index 0000000000..370924c29f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalb.S
@@ -0,0 +1,100 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type zero_nan,@object
+zero_nan:			/* four doubles, selected by byte offset 0, 8, 16 or 24 */
+	.double 0.0		/* offset 0: +0.0 */
+nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f	/* offset 8: NaN */
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80	/* offset 16: -0.0 */
+	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f	/* offset 24: NaN */
+	ASM_SIZE_DIRECTIVE(zero_nan)
+
+/* MO(sym): memory operand for sym.  MOX(sym,x,f): sym indexed by register x scaled by f.  PIC builds go through @GOTOFF off %ecx.  */
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+	.text
+ENTRY(__ieee754_scalb)		/* scale the double x at 4(%esp) by 2^y, y being the double at 12(%esp) */
+	fldl	12(%esp)	/* y */
+	fxam			/* classify y */
+	fnstsw
+	fldl	4(%esp)		/* x : y */
+	andl	$0x4700, %eax	/* keep C3, C2, C1, C0 of y */
+	cmpl	$0x0700, %eax	/* C2|C1|C0: y is -inf */
+	je	1f
+	andl	$0x4500, %eax	/* keep C3, C2, C0 */
+	cmpl	$0x0100, %eax	/* C0 alone: y is NaN */
+	je	2f
+	fxam			/* classify x */
+	fnstsw
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax	/* x is NaN */
+	je	3f
+	fld	%st(1)		/* y : x : y */
+	frndint			/* y rounded to an integer : x : y */
+	fcomp	%st(2)		/* x : y */
+	fnstsw
+	sahf
+	jne	4f		/* y is not an integer */
+	fscale			/* x * 2^y : y */
+	fstp	%st(1)		/* x * 2^y */
+	DBL_NARROW_EVAL		/* macro from <i386-math-asm.h>, not visible here; presumably narrows the result to double -- confirm */
+	ret
+
+	/* y is -inf */
+1:	fxam			/* classify x, which is in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fnstsw
+	movl	8(%esp), %edx	/* high 32 bits of x; bit 31 is the sign */
+	shrl	$5, %eax	/* C0 -> 0x8, C2 -> 0x20, C3 -> 0x200 */
+	fstp	%st
+	fstp	%st		/* x87 stack is empty again */
+	andl	$0x80000000, %edx	/* sign of x */
+	andl	$0x0228, %eax	/* keep C3, C2, C0 */
+	cmpl	$0x0028, %eax	/* C2|C0: x is an infinity */
+	je	4f
+	andl	$8, %eax	/* 8 if C0 is set (x is NaN), else 0 */
+	shrl	$27, %edx	/* 16 if x is negative, else 0 */
+	addl	%edx, %eax	/* byte offset into zero_nan: +0.0, NaN, -0.0 or NaN */
+	fldl	MOX(zero_nan, %eax, 1)
+	ret
+
+	/* The result is NaN, but we must not raise an exception.
+	   So use a variable.  */
+2:	fstp	%st
+	fstp	%st
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	MO(nan)
+	ret
+
+	/* The first parameter is a NaN.  Return it.  */
+3:	fstp	%st(1)		/* drop y, keeping x in st(0) */
+	ret
+
+	/* Return NaN and raise the invalid exception.  */
+4:	fstp	%st
+	fstp	%st
+	fldz
+	fdiv	%st		/* 0.0 / 0.0 */
+	ret
+END(__ieee754_scalb)
+strong_alias (__ieee754_scalb, __scalb_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S b/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S
new file mode 100644
index 0000000000..4f2dfa3acf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S
@@ -0,0 +1,102 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type zero_nan,@object
+zero_nan:			/* four doubles, selected by byte offset 0, 8, 16 or 24 */
+	.double 0.0		/* offset 0: +0.0 */
+nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f	/* offset 8: NaN */
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80	/* offset 16: -0.0 */
+	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f	/* offset 24: NaN */
+	ASM_SIZE_DIRECTIVE(zero_nan)
+
+/* MO(sym): memory operand for sym.  MOX(sym,x,f): sym indexed by register x scaled by f.  PIC builds go through @GOTOFF off %ecx.  */
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+
+	.text
+ENTRY(__ieee754_scalbf)		/* scale the float x at 4(%esp) by 2^y, y being the float at 8(%esp) */
+	flds	8(%esp)		/* y */
+	fxam			/* classify y */
+	fnstsw
+	flds	4(%esp)		/* x : y */
+	andl	$0x4700, %eax	/* keep C3, C2, C1, C0 of y */
+	cmpl	$0x0700, %eax	/* C2|C1|C0: y is -inf */
+	je	1f
+	andl	$0x4500, %eax	/* keep C3, C2, C0 */
+	cmpl	$0x0100, %eax	/* C0 alone: y is NaN */
+	je	2f
+	fxam			/* classify x */
+	fnstsw
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax	/* x is NaN */
+	je	3f
+	fld	%st(1)		/* y : x : y */
+	frndint			/* y rounded to an integer : x : y */
+	fcomp	%st(2)		/* x : y */
+	fnstsw
+	sahf
+	jne	4f		/* y is not an integer */
+	fscale			/* x * 2^y : y */
+	fstp	%st(1)		/* x * 2^y */
+	FLT_NARROW_EVAL		/* macro from <i386-math-asm.h>, not visible here; presumably narrows the result to float -- confirm */
+	ret
+
+	/* y is -inf */
+1:	fxam			/* classify x, which is in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fnstsw
+	movl	4(%esp), %edx	/* the 32 bits of x; bit 31 is the sign */
+	shrl	$5, %eax	/* C0 -> 0x8, C2 -> 0x20, C3 -> 0x200 */
+	fstp	%st
+	fstp	%st		/* x87 stack is empty again */
+	andl	$0x80000000, %edx	/* sign of x */
+	andl	$0x0228, %eax	/* keep C3, C2, C0 */
+	cmpl	$0x0028, %eax	/* C2|C0: x is an infinity */
+	je	4f
+	andl	$8, %eax	/* 8 if C0 is set (x is NaN), else 0 */
+	shrl	$27, %edx	/* 16 if x is negative, else 0 */
+	addl	%edx, %eax	/* byte offset into zero_nan: +0.0, NaN, -0.0 or NaN */
+	fldl	MOX(zero_nan, %eax, 1)
+	ret
+
+	/* The result is NaN, but we must not raise an exception.
+	   So use a variable.  */
+2:	fstp	%st
+	fstp	%st
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fldl	MO(nan)
+	ret
+
+	/* The first parameter is a NaN.  Return it.  */
+3:	fstp	%st(1)		/* drop y, keeping x in st(0) */
+	ret
+
+	/* Return NaN and raise the invalid exception.  */
+4:	fstp	%st
+	fstp	%st
+	fldz
+	fdiv	%st		/* 0.0 / 0.0 */
+	ret
+END(__ieee754_scalbf)
+strong_alias (__ieee754_scalbf, __scalbf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S b/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S
new file mode 100644
index 0000000000..896f599cb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S
@@ -0,0 +1,90 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type zero_nan,@object
+zero_nan:			/* four doubles, selected by byte offset 0, 8, 16 or 24 */
+	.double 0.0		/* offset 0: +0.0 */
+nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f	/* offset 8: NaN */
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80	/* offset 16: -0.0 */
+	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f	/* offset 24: NaN */
+	ASM_SIZE_DIRECTIVE(zero_nan)
+
+/* MO(sym): memory operand for sym.  MOX(sym,x,f): sym indexed by register x scaled by f.  PIC builds go through @GOTOFF off %ecx.  */
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+	.text
+ENTRY(__ieee754_scalbl)		/* scale the long double x at 4(%esp) by 2^y, y being at 16(%esp) */
+	fldt	16(%esp)	/* y */
+	fxam			/* classify y */
+	fnstsw
+	fldt	4(%esp)		/* x : y */
+	andl	$0x4700, %eax	/* keep C3, C2, C1, C0 of y */
+	cmpl	$0x0700, %eax	/* C2|C1|C0: y is -inf */
+	je	1f
+	andl	$0x4500, %eax	/* keep C3, C2, C0 */
+	cmpl	$0x0100, %eax	/* C0 alone: y is NaN */
+	je	2f
+	fxam			/* classify x */
+	fnstsw
+	andl	$0x4500, %eax
+	cmpl	$0x0100, %eax	/* x is NaN */
+	je	2f
+	fld	%st(1)		/* y : x : y */
+	frndint			/* y rounded to an integer : x : y */
+	fcomp	%st(2)		/* x : y */
+	fnstsw
+	sahf
+	jne	4f		/* y is not an integer */
+	fscale			/* x * 2^y : y */
+	fstp	%st(1)		/* x * 2^y */
+	ret
+
+	/* y is -inf */
+1:	fxam			/* classify x, which is in st(0) */
+#ifdef  PIC
+	LOAD_PIC_REG (cx)
+#endif
+	fnstsw
+	movl	12(%esp), %edx	/* bytes 8-11 of x; the low 16 bits hold sign and exponent */
+	shrl	$5, %eax	/* C0 -> 0x8, C2 -> 0x20, C3 -> 0x200 */
+	fstp	%st
+	fstp	%st		/* x87 stack is empty again */
+	andl	$0x8000, %edx	/* sign of x */
+	andl	$0x0228, %eax	/* keep C3, C2, C0 */
+	cmpl	$0x0028, %eax	/* C2|C0: x is an infinity */
+	je	4f
+	andl	$8, %eax	/* 8 if C0 is set (x is NaN), else 0 */
+	shrl	$11, %edx	/* 16 if x is negative, else 0 */
+	addl	%edx, %eax	/* byte offset into zero_nan: +0.0, NaN, -0.0 or NaN */
+	fldl	MOX(zero_nan, %eax, 1)
+	ret
+
+	/* The result is NaN; raise an exception for sNaN arguments.  */
+2:	faddp			/* x + y, with one of them a NaN */
+	ret
+
+	/* Return NaN and raise the invalid exception.  */
+4:	fstp	%st
+	fstp	%st
+	fldz
+	fdiv	%st		/* 0.0 / 0.0 */
+	ret
+END(__ieee754_scalbl)
+strong_alias (__ieee754_scalbl, __scalbl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S b/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S
new file mode 100644
index 0000000000..fba5833a9a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_sqrt)		/* square root of the double at 4(%esp) */
+	fldl	4(%esp)		/* x */
+	subl	$8, %esp	/* two 4-byte slots: new and saved control word */
+	cfi_adjust_cfa_offset (8)
+	fstcw	4(%esp)		/* save the caller's control word */
+	movl	$0xfeff, %edx	/* mask that clears bit 8, the low precision-control bit */
+	andl	4(%esp), %edx
+	movl	%edx, (%esp)
+	fldcw	(%esp)		/* presumably selects double precision when extended was set -- confirm */
+	fsqrt
+	fldcw	4(%esp)		/* restore the caller's control word */
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	ret
+END (__ieee754_sqrt)
+strong_alias (__ieee754_sqrt, __sqrt_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S b/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S
new file mode 100644
index 0000000000..6f7e4b015f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S
@@ -0,0 +1,13 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_sqrtf)		/* square root of the float at 4(%esp) */
+	flds	4(%esp)		/* x */
+	fsqrt			/* sqrt(x), returned in st(0) */
+	ret
+END (__ieee754_sqrtf)
+strong_alias (__ieee754_sqrtf, __sqrtf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c b/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c
new file mode 100644
index 0000000000..41bcd7eeb7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+#undef __ieee754_sqrtl
+long double
+__ieee754_sqrtl (long double x)
+{
+  long double res;
+  /* "=t" is the top of the x87 stack; "0" places X in that same register.  */
+  asm ("fsqrt" : "=t" (res) : "0" (x));
+  /* RES is the square root computed by fsqrt.  */
+  return res;
+}
+strong_alias (__ieee754_sqrtl, __sqrtl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c b/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c
new file mode 100644
index 0000000000..5d8596964b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c
@@ -0,0 +1,69 @@
+/* Clear given exceptions in current floating-point environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__feclearexcept (int excepts)
+{
+  fenv_t temp;
+  /* Clear the flags in EXCEPTS in the x87 status word and, with SSE, MXCSR.  */
+  /* Mask out unsupported bits/exceptions.  */
+  excepts &= FE_ALL_EXCEPT;
+
+  /* Bah, we have to clear selected exceptions.  Since there is no
+     `fldsw' instruction we have to do it the hard way.  */
+  __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+  /* Keep only flags in FE_ALL_EXCEPT but not in EXCEPTS; clear the rest.  */
+  temp.__status_word &= excepts ^ FE_ALL_EXCEPT;
+
+  /* Put the new data in effect.  */
+  __asm__ ("fldenv %0" : : "m" (*&temp));
+
+  /* If the CPU supports SSE, we clear the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+      /* Clear the same bit positions in MXCSR (EXCEPTS is not shifted).  */
+      xnew_exc &= ~excepts;
+
+      /* Put the new data in effect.  */
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+
+  /* Success.  */
+  return 0;
+}
+/* Symbol versioning; presumably a GLIBC_2.1 compat alias plus the GLIBC_2.2 default -- confirm in shlib-compat.h.  */
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feclearexcept, __old_feclearexcept)
+compat_symbol (libm, __old_feclearexcept, feclearexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feclearexcept, feclearexcept)
+versioned_symbol (libm, __feclearexcept, feclearexcept, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c b/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c
new file mode 100644
index 0000000000..f8db665425
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c
@@ -0,0 +1,54 @@
+/* Disable floating-point exceptions.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+fedisableexcept (int excepts)
+{
+  unsigned short int new_exc, old_exc;
+  /* Disable EXCEPTS in the x87 control word (and MXCSR); return the old set.  */
+  /* Get the current control word.  */
+  __asm__ ("fstcw %0" : "=m" (*&new_exc));
+  /* Exceptions whose control-word bit is clear, i.e. the enabled ones.  */
+  old_exc = (~new_exc) & FE_ALL_EXCEPT;
+  /* Ignore bits that are not exceptions.  */
+  excepts &= FE_ALL_EXCEPT;
+  /* Setting a control-word bit disables that exception.  */
+  new_exc |= excepts;
+  __asm__ ("fldcw %0" : : "m" (*&new_exc));
+
+  /* If the CPU supports SSE we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+      /* The corresponding MXCSR bits sit 7 positions higher.  */
+      xnew_exc |= excepts << 7;
+
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+  /* Exceptions that were enabled on entry.  */
+  return old_exc;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c b/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c
new file mode 100644
index 0000000000..f1c42d7c27
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c
@@ -0,0 +1,54 @@
+/* Enable floating-point exceptions.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+feenableexcept (int excepts)
+{
+  unsigned short int new_exc;
+  unsigned short int old_exc;
+  /* Enable EXCEPTS in the x87 control word (and MXCSR); return the old set.  */
+  /* Get the current control word.  */
+  __asm__ ("fstcw %0" : "=m" (*&new_exc));
+  /* Ignore non-exception bits; OLD_EXC is the set with a clear bit (enabled).  */
+  excepts &= FE_ALL_EXCEPT;
+  old_exc = (~new_exc) & FE_ALL_EXCEPT;
+  /* Clearing a control-word bit enables that exception.  */
+  new_exc &= ~excepts;
+  __asm__ ("fldcw %0" : : "m" (*&new_exc));
+
+  /* If the CPU supports SSE we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+      /* The corresponding MXCSR bits sit 7 positions higher.  */
+      xnew_exc &= ~(excepts << 7);
+
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+  /* Exceptions that were enabled on entry.  */
+  return old_exc;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetenv.c b/REORG.TODO/sysdeps/i386/fpu/fegetenv.c
new file mode 100644
index 0000000000..983f6af25e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetenv.c
@@ -0,0 +1,49 @@
+/* Store current floating-point environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fegetenv (fenv_t *envp)
+{
+  __asm__ ("fnstenv %0" : "=m" (*envp));	/* Save the x87 environment in *ENVP.  */
+  /* And load it right back since the processor changes the mask.
+     Intel thought this opcode to be used in interrupt handlers which
+     would block all exceptions.  */
+  __asm__ ("fldenv %0" : : "m" (*envp));
+  /* With SSE, MXCSR goes in the __eip field, replacing what fnstenv stored.  */
+  if (HAS_CPU_FEATURE (SSE))
+    __asm__ ("stmxcsr %0" : "=m" (envp->__eip));
+
+  /* Success.  */
+  return 0;
+}
+/* Symbol versioning; presumably a GLIBC_2.1 compat alias plus the GLIBC_2.2 default -- confirm in shlib-compat.h.  */
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetenv, __old_fegetenv)
+compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__fegetenv)
+libm_hidden_ver (__fegetenv, fegetenv)
+versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c b/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c
new file mode 100644
index 0000000000..dc87b7a470
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c
@@ -0,0 +1,31 @@
+/* Get enabled floating-point exceptions.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+
+int
+fegetexcept (void)
+{
+  unsigned short int exc;
+  /* Return the exceptions in FE_ALL_EXCEPT whose control-word bit is clear.  */
+  /* Get the current control word.  */
+  __asm__ ("fstcw %0" : "=m" (*&exc));
+  /* Invert the word so that clear bits become set, then keep FE_ALL_EXCEPT.  */
+  return (~exc) & FE_ALL_EXCEPT;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetmode.c b/REORG.TODO/sysdeps/i386/fpu/fegetmode.c
new file mode 100644
index 0000000000..abbce3075f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetmode.c
@@ -0,0 +1,32 @@
+/* Store current floating-point control modes.  i386 version.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+fegetmode (femode_t *modep)
+{
+  _FPU_GETCW (modep->__control_word);	/* Store the x87 control word in *MODEP.  */
+  if (HAS_CPU_FEATURE (SSE))	/* MXCSR is stored only when SSE is available.  */
+    __asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr));
+  return 0;	/* Always 0.  */
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetround.c b/REORG.TODO/sysdeps/i386/fpu/fegetround.c
new file mode 100644
index 0000000000..8ce8b859d8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetround.c
@@ -0,0 +1,33 @@
+/* Return current rounding direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+
+int
+__fegetround (void)
+{
+  int cw;
+  /* fnstcw stores the 16-bit x87 control word into the low half of CW.  */
+  __asm__ ("fnstcw %0" : "=m" (*&cw));
+  /* Keep bits 10-11 (rounding control); this also drops the unwritten half.  */
+  return cw & 0xc00;
+}
+libm_hidden_def (__fegetround)
+weak_alias (__fegetround, fegetround)
+libm_hidden_weak (fegetround)
diff --git a/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c b/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c
new file mode 100644
index 0000000000..d327358913
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c
@@ -0,0 +1,50 @@
+/* Store current floating-point environment and clear exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__feholdexcept (fenv_t *envp)
+{
+  /* Save the x87 environment into *ENVP and wipe the pending x87
+     exception flags; fnstenv itself leaves every exception masked.  */
+  __asm__ volatile ("fnstenv %0; fnclex" : "=m" (*envp));
+
+  /* With SSE present the MXCSR gets the same treatment.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int held_mxcsr;
+
+      /* The current MXCSR is parked in the __eip slot of *ENVP.  */
+      __asm__ ("stmxcsr %0" : "=m" (envp->__eip));
+
+      /* Make all exceptions non-stop (0x1f80) and clear them (0x3f).  */
+      held_mxcsr = (envp->__eip & ~0x3f) | 0x1f80;
+
+      __asm__ ("ldmxcsr %0" : : "m" (*&held_mxcsr));
+    }
+
+  return 0;
+}
+libm_hidden_def (__feholdexcept)
+weak_alias (__feholdexcept, feholdexcept)
+libm_hidden_weak (feholdexcept)
diff --git a/REORG.TODO/sysdeps/i386/fpu/fenv_private.h b/REORG.TODO/sysdeps/i386/fpu/fenv_private.h
new file mode 100644
index 0000000000..e20e1f1662
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fenv_private.h
@@ -0,0 +1,501 @@
+#ifndef FENV_PRIVATE_H
+#define FENV_PRIVATE_H 1
+
+#include <fenv.h>
+#include <fpu_control.h>
+
+#ifdef __SSE2_MATH__ /* Empty asms: up to double via "x" operands, wider types via "t"/"f".  */
+# define math_opt_barrier(x) \
+  ({ __typeof(x) __x;					\
+     if (sizeof (x) <= sizeof (double))			\
+       __asm ("" : "=x" (__x) : "0" (x));		\
+     else						\
+       __asm ("" : "=t" (__x) : "0" (x));		\
+     __x; })
+# define math_force_eval(x) \
+  do {							\
+    if (sizeof (x) <= sizeof (double))			\
+      __asm __volatile ("" : : "x" (x));		\
+    else						\
+      __asm __volatile ("" : : "f" (x));		\
+  } while (0)
+#else /* !__SSE2_MATH__: barrier via "t"; force_eval via "m" (up to double) or "f".  */
+# define math_opt_barrier(x) \
+  ({ __typeof (x) __x;					\
+     __asm ("" : "=t" (__x) : "0" (x));			\
+     __x; })
+# define math_force_eval(x) \
+  do {							\
+    __typeof (x) __x = (x);				\
+    if (sizeof (x) <= sizeof (double))			\
+      __asm __volatile ("" : : "m" (__x));		\
+    else						\
+      __asm __volatile ("" : : "f" (__x));		\
+  } while (0)
+#endif /* __SSE2_MATH__ */
+
+/* This file is used by both the 32- and 64-bit ports.  The 64-bit port
+   has a field in the fenv_t for the mxcsr; the 32-bit port does not.
+   Instead, we (ab)use the only 32-bit field extant in the struct.  */
+#ifndef __x86_64__
+# define __mxcsr	__eip
+#endif /* !__x86_64__; the alias is removed again by the #undef at the end of this header.  */
+
+
+/* All of these functions are private to libm, and are all used in pairs
+   to save+change the fp state and restore the original state.  Thus we
+   need not care for both the 387 and the sse unit, only the one we're
+   actually using.  */
+/* Mnemonic spelling pasted into the MXCSR asm strings below.  */
+#if defined __AVX__ || defined SSE2AVX
+# define STMXCSR "vstmxcsr"
+# define LDMXCSR "vldmxcsr"
+#else
+# define STMXCSR "stmxcsr"
+# define LDMXCSR "ldmxcsr"
+#endif /* __AVX__ || SSE2AVX */
+
+static __always_inline void
+libc_feholdexcept_sse (fenv_t *e)
+{
+  unsigned int csr; /* Working copy of the MXCSR.  */
+  asm (STMXCSR " %0" : "=m" (*&csr));
+  e->__mxcsr = csr; /* Remember the value found on entry.  */
+  csr = (csr & ~0x3f) | 0x1f80; /* 0x3f bits cleared, 0x1f80 bits set.  */
+  asm volatile (LDMXCSR " %0" : : "m" (*&csr));
+}
+
+static __always_inline void
+libc_feholdexcept_387 (fenv_t *e)
+{ /* Saves the x87 environment in *E, then clears the exception flags.  */
+  /* Recall that fnstenv has a side-effect of masking exceptions.
+     Clobber all of the fp registers so that the TOS field is 0.  */
+  asm volatile ("fnstenv %0; fnclex"
+		: "=m"(*e)
+		: : "st", "st(1)", "st(2)", "st(3)",
+		    "st(4)", "st(5)", "st(6)", "st(7)");
+}
+
+static __always_inline void
+libc_fesetround_sse (int r)
+{
+  unsigned int csr; /* R << 3 replaces the bits under mask 0x6000.  */
+  asm (STMXCSR " %0" : "=m" (*&csr));
+  csr = (r << 3) | (csr & ~0x6000);
+  asm volatile (LDMXCSR " %0" : : "m" (*&csr));
+}
+
+static __always_inline void
+libc_fesetround_387 (int r)
+{
+  fpu_control_t ctrl; /* x87 control word being edited.  */
+  _FPU_GETCW (ctrl);
+  ctrl = r | (ctrl & ~0xc00); /* R replaces the bits under mask 0xc00.  */
+  _FPU_SETCW (ctrl);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_sse (fenv_t *e, int r)
+{
+  unsigned int csr; /* Working copy of the MXCSR.  */
+  asm (STMXCSR " %0" : "=m" (*&csr));
+  e->__mxcsr = csr; /* Remember the value found on entry.  */
+  csr = (csr & ~0x603f) | 0x1f80 | (r << 3);
+  asm volatile (LDMXCSR " %0" : : "m" (*&csr));
+}
+
+/* Hold exceptions, then install rounding mode and precision together;
+   R carries both fields.  Shared by the extended and 53-bit variants.  */
+static __always_inline void
+libc_feholdexcept_setround_387_prec (fenv_t *e, int r)
+{
+  libc_feholdexcept_387 (e);
+
+  /* Start from the saved word, swap in R and set the low six bits.  */
+  fpu_control_t ctrl = ((e->__control_word & ~(_FPU_RC_ZERO | _FPU_EXTENDED))
+                        | r | 0x3f);
+  _FPU_SETCW (ctrl);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387 (fenv_t *e, int r)
+{ /* Precision field fixed to _FPU_EXTENDED.  */
+  libc_feholdexcept_setround_387_prec (e, _FPU_EXTENDED | r);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_53bit (fenv_t *e, int r)
+{ /* Precision field fixed to _FPU_DOUBLE.  */
+  libc_feholdexcept_setround_387_prec (e, _FPU_DOUBLE | r);
+}
+
+static __always_inline int
+libc_fetestexcept_sse (int e)
+{
+  unsigned int csr; /* Snapshot of the MXCSR.  */
+  asm volatile (STMXCSR " %0" : "=m" (*&csr));
+  return FE_ALL_EXCEPT & e & csr;
+}
+
+static __always_inline int
+libc_fetestexcept_387 (int ex)
+{
+  fexcept_t status; /* Snapshot of the x87 status word.  */
+  asm volatile ("fnstsw %0" : "=a" (status));
+  return FE_ALL_EXCEPT & ex & status;
+}
+
+static __always_inline void
+libc_fesetenv_sse (fenv_t *e)
+{ /* Loads the MXCSR from the value stored in *E.  */
+  asm volatile (LDMXCSR " %0" : : "m" (e->__mxcsr));
+}
+
+static __always_inline void
+libc_fesetenv_387 (fenv_t *e)
+{ /* Reloads the x87 environment from *E with fldenv.  */
+  /* Clobber all fp registers so that the TOS value we saved earlier is
+     compatible with the current state of the compiler.  */
+  asm volatile ("fldenv %0"
+		: : "m" (*e)
+		: "st", "st(1)", "st(2)", "st(3)",
+		  "st(4)", "st(5)", "st(6)", "st(7)");
+}
+
+static __always_inline int
+libc_feupdateenv_test_sse (fenv_t *e, int ex)
+{
+  unsigned int live, saved, raised;
+  asm volatile (STMXCSR " %0" : "=m" (*&live));
+  raised = FE_ALL_EXCEPT & live;
+
+  /* Re-install the saved MXCSR with the currently raised flags OR-ed in.  */
+  saved = e->__mxcsr;
+  live = raised | saved;
+  asm volatile (LDMXCSR " %0" : : "m" (*&live));
+
+  /* A raised flag whose mask bit (seven positions up) is clear in the
+     saved MXCSR is raised explicitly; normally everything is masked.  */
+  if (__glibc_unlikely (raised & ~(saved >> 7)))
+    __feraiseexcept (raised);
+
+  /* Report which of the exceptions in EX were raised since the hold.  */
+  return ex & raised;
+}
+
+static __always_inline int
+libc_feupdateenv_test_387 (fenv_t *e, int ex)
+{
+  fexcept_t raised;
+
+  /* Read the x87 status word and keep only the standard flags.  */
+  asm volatile ("fnstsw %0" : "=a" (raised));
+  raised = raised & FE_ALL_EXCEPT;
+
+  /* Put the environment saved in *E back in place.  */
+  libc_fesetenv_387 (e);
+
+  /* Raise again whatever had been flagged before the reload.  */
+  __feraiseexcept (raised);
+
+  /* Tell the caller which of the exceptions in EX were among them.  */
+  return ex & raised;
+}
+
+static __always_inline void
+libc_feupdateenv_sse (fenv_t *e)
+{ /* Same as the _test variant with the result ignored.  */
+  (void) libc_feupdateenv_test_sse (e, 0);
+}
+
+static __always_inline void
+libc_feupdateenv_387 (fenv_t *e)
+{ /* Same as the _test variant with the result ignored.  */
+  (void) libc_feupdateenv_test_387 (e, 0);
+}
+
+static __always_inline void
+libc_feholdsetround_sse (fenv_t *e, int r)
+{
+  unsigned int csr; /* Working copy of the MXCSR.  */
+  asm (STMXCSR " %0" : "=m" (*&csr));
+  e->__mxcsr = csr; /* Remember the value found on entry.  */
+  csr = (r << 3) | (csr & ~0x6000); /* Only the 0x6000 field changes.  */
+  asm volatile (LDMXCSR " %0" : : "m" (*&csr));
+}
+
+static __always_inline void
+libc_feholdsetround_387_prec (fenv_t *e, int r)
+{
+  fpu_control_t ctrl; /* Only the control word is stored in *E here.  */
+
+  _FPU_GETCW (ctrl);
+  e->__control_word = ctrl;
+  /* Replace the rounding and precision fields with those carried in R.  */
+  ctrl = (ctrl & ~(_FPU_RC_ZERO | _FPU_EXTENDED)) | r;
+  _FPU_SETCW (ctrl);
+}
+
+static __always_inline void
+libc_feholdsetround_387 (fenv_t *e, int r)
+{ /* Precision field fixed to _FPU_EXTENDED.  */
+  libc_feholdsetround_387_prec (e, _FPU_EXTENDED | r);
+}
+
+static __always_inline void
+libc_feholdsetround_387_53bit (fenv_t *e, int r)
+{ /* Precision field fixed to _FPU_DOUBLE.  */
+  libc_feholdsetround_387_prec (e, _FPU_DOUBLE | r);
+}
+
+static __always_inline void
+libc_feresetround_sse (fenv_t *e)
+{
+  unsigned int csr; /* Only the 0x6000 field is taken from *E.  */
+  asm (STMXCSR " %0" : "=m" (*&csr));
+  csr = (e->__mxcsr & 0x6000) | (csr & ~0x6000);
+  asm volatile (LDMXCSR " %0" : : "m" (*&csr));
+}
+
+static __always_inline void
+libc_feresetround_387 (fenv_t *e)
+{ /* Reloads the whole control word stored in *E, not only the rounding bits.  */
+  _FPU_SETCW (e->__control_word);
+}
+
+#ifdef __SSE_MATH__ /* f-suffixed names: SSE helpers when __SSE_MATH__ is defined.  */
+# define libc_feholdexceptf		libc_feholdexcept_sse
+# define libc_fesetroundf		libc_fesetround_sse
+# define libc_feholdexcept_setroundf	libc_feholdexcept_setround_sse
+# define libc_fetestexceptf		libc_fetestexcept_sse
+# define libc_fesetenvf			libc_fesetenv_sse
+# define libc_feupdateenv_testf		libc_feupdateenv_test_sse
+# define libc_feupdateenvf		libc_feupdateenv_sse
+# define libc_feholdsetroundf		libc_feholdsetround_sse
+# define libc_feresetroundf		libc_feresetround_sse
+#else /* !__SSE_MATH__: the f-suffixed names use the 387 helpers.  */
+# define libc_feholdexceptf		libc_feholdexcept_387
+# define libc_fesetroundf		libc_fesetround_387
+# define libc_feholdexcept_setroundf	libc_feholdexcept_setround_387
+# define libc_fetestexceptf		libc_fetestexcept_387
+# define libc_fesetenvf			libc_fesetenv_387
+# define libc_feupdateenv_testf		libc_feupdateenv_test_387
+# define libc_feupdateenvf		libc_feupdateenv_387
+# define libc_feholdsetroundf		libc_feholdsetround_387
+# define libc_feresetroundf		libc_feresetround_387
+#endif /* __SSE_MATH__ */
+
+#ifdef __SSE2_MATH__ /* Unsuffixed names: SSE helpers when __SSE2_MATH__ is defined.  */
+# define libc_feholdexcept		libc_feholdexcept_sse
+# define libc_fesetround		libc_fesetround_sse
+# define libc_feholdexcept_setround	libc_feholdexcept_setround_sse
+# define libc_fetestexcept		libc_fetestexcept_sse
+# define libc_fesetenv			libc_fesetenv_sse
+# define libc_feupdateenv_test		libc_feupdateenv_test_sse
+# define libc_feupdateenv		libc_feupdateenv_sse
+# define libc_feholdsetround		libc_feholdsetround_sse
+# define libc_feresetround		libc_feresetround_sse
+#else /* !__SSE2_MATH__: the unsuffixed names use the 387 helpers.  */
+# define libc_feholdexcept		libc_feholdexcept_387
+# define libc_fesetround		libc_fesetround_387
+# define libc_feholdexcept_setround	libc_feholdexcept_setround_387
+# define libc_fetestexcept		libc_fetestexcept_387
+# define libc_fesetenv			libc_fesetenv_387
+# define libc_feupdateenv_test		libc_feupdateenv_test_387
+# define libc_feupdateenv		libc_feupdateenv_387
+# define libc_feholdsetround		libc_feholdsetround_387
+# define libc_feresetround		libc_feresetround_387
+#endif /* __SSE2_MATH__ */
+/* The l-suffixed names always map to the 387 helpers.  */
+#define libc_feholdexceptl		libc_feholdexcept_387
+#define libc_fesetroundl		libc_fesetround_387
+#define libc_feholdexcept_setroundl	libc_feholdexcept_setround_387
+#define libc_fetestexceptl		libc_fetestexcept_387
+#define libc_fesetenvl			libc_fesetenv_387
+#define libc_feupdateenv_testl		libc_feupdateenv_test_387
+#define libc_feupdateenvl		libc_feupdateenv_387
+#define libc_feholdsetroundl		libc_feholdsetround_387
+#define libc_feresetroundl		libc_feresetround_387
+/* The 53bit names are only defined when __SSE2_MATH__ is not.  */
+#ifndef __SSE2_MATH__
+# define libc_feholdexcept_setround_53bit libc_feholdexcept_setround_387_53bit
+# define libc_feholdsetround_53bit	libc_feholdsetround_387_53bit
+#endif /* !__SSE2_MATH__ */
+
+/* We have support for rounding mode context (the *_ctx helpers below, which take a struct rm_ctx).  */
+#define HAVE_RM_CTX 1
+
+static __always_inline void
+libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r)
+{
+  unsigned int cur, want;
+  asm (STMXCSR " %0" : "=m" (*&cur));
+  /* Wanted MXCSR: 0x1f80 bits set, 0x3f bits cleared, R in 0x6000.  */
+  want = (cur & ~0x603f) | 0x1f80 | (r << 3);
+
+  /* The previous value is recorded unconditionally; the register is
+     only rewritten, and the context flagged as updated, when the
+     wanted value actually differs.  */
+  ctx->env.__mxcsr = cur;
+  ctx->updated_status = want != cur;
+  if (__glibc_unlikely (ctx->updated_status))
+    asm volatile (LDMXCSR " %0" : : "m" (*&want));
+}
+
+/* Always reload: exceptions raised inside the context are meant to be
+   overwritten, which is why the fehold* helpers always fill ctx->env.  */
+static __always_inline void
+libc_fesetenv_sse_ctx (struct rm_ctx *ctx)
+{
+  fenv_t *saved = &ctx->env;
+  libc_fesetenv_sse (saved);
+}
+
+static __always_inline void
+libc_feupdateenv_sse_ctx (struct rm_ctx *ctx)
+{ /* Acts only when CTX is marked as updated; the test result is unused.  */
+  if (__glibc_unlikely (ctx->updated_status))
+    (void) libc_feupdateenv_test_sse (&ctx->env, 0);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_prec_ctx (struct rm_ctx *ctx, int r)
+{
+  libc_feholdexcept_387 (&ctx->env);
+
+  /* Build the control word we want from the one just saved: R supplies
+     the rounding and precision fields, 0x3f sets the low six
+     bits.  */
+  const fpu_control_t saved = ctx->env.__control_word;
+  fpu_control_t want = saved & ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+  want |= r | 0x3f;
+
+  /* The control word is reloaded, and the context marked as updated,
+     only when that differs from the saved value.  */
+  ctx->updated_status = saved != want;
+  if (__glibc_unlikely (ctx->updated_status))
+    _FPU_SETCW (want);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_ctx (struct rm_ctx *ctx, int r)
+{ /* Precision field fixed to _FPU_EXTENDED.  */
+  libc_feholdexcept_setround_387_prec_ctx (ctx, _FPU_EXTENDED | r);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_53bit_ctx (struct rm_ctx *ctx, int r)
+{ /* Precision field fixed to _FPU_DOUBLE.  */
+  libc_feholdexcept_setround_387_prec_ctx (ctx, _FPU_DOUBLE | r);
+}
+
+static __always_inline void
+libc_feholdsetround_387_prec_ctx (struct rm_ctx *ctx, int r)
+{
+  /* Save the current x87 control word in CTX and switch rounding and
+     precision to the values carried in R.  Nothing else in the
+     control word is altered.  */
+  fpu_control_t cur, want;
+
+  _FPU_GETCW (cur);
+  /* R replaces the rounding and precision fields of the live word.  */
+  want = (cur & ~(_FPU_RC_ZERO | _FPU_EXTENDED)) | r;
+
+  /* The live word is always saved; it is only rewritten, and the
+     context marked as updated, when the new value differs.  */
+  ctx->env.__control_word = cur;
+  ctx->updated_status = want != cur;
+  if (__glibc_unlikely (ctx->updated_status))
+    _FPU_SETCW (want);
+}
+
+static __always_inline void
+libc_feholdsetround_387_ctx (struct rm_ctx *ctx, int r)
+{ /* Precision field fixed to _FPU_EXTENDED.  */
+  libc_feholdsetround_387_prec_ctx (ctx, _FPU_EXTENDED | r);
+}
+
+static __always_inline void
+libc_feholdsetround_387_53bit_ctx (struct rm_ctx *ctx, int r)
+{ /* Precision field fixed to _FPU_DOUBLE.  */
+  libc_feholdsetround_387_prec_ctx (ctx, _FPU_DOUBLE | r);
+}
+
+static __always_inline void
+libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r)
+{
+  unsigned int cur, want;
+
+  asm (STMXCSR " %0" : "=m" (*&cur));
+  /* Only the field under mask 0x6000 changes; R is shifted into it.  */
+  want = (r << 3) | (cur & ~0x6000);
+
+  /* Save the old MXCSR unconditionally, but reload the register and
+     mark the context as updated only if the value really
+     changes.  */
+  ctx->env.__mxcsr = cur;
+  ctx->updated_status = want != cur;
+  if (__glibc_unlikely (ctx->updated_status))
+    asm volatile (LDMXCSR " %0" : : "m" (*&want));
+}
+
+static __always_inline void
+libc_feresetround_sse_ctx (struct rm_ctx *ctx)
+{ /* Restores the rounding field from ctx->env, but only if CTX is marked as updated.  */
+  if (__glibc_unlikely (ctx->updated_status))
+    libc_feresetround_sse (&ctx->env);
+}
+
+static __always_inline void
+libc_feresetround_387_ctx (struct rm_ctx *ctx)
+{ /* Reload the saved control word, but only if CTX is marked as updated.  */
+  if (__glibc_unlikely (ctx->updated_status))
+    libc_feresetround_387 (&ctx->env);
+}
+
+static __always_inline void
+libc_feupdateenv_387_ctx (struct rm_ctx *ctx)
+{ /* Acts only when CTX is marked as updated; the test result is unused.  */
+  if (__glibc_unlikely (ctx->updated_status))
+    (void) libc_feupdateenv_test_387 (&ctx->env, 0);
+}
+
+#ifdef __SSE_MATH__ /* f-suffixed _ctx names: SSE helpers when __SSE_MATH__ is defined.  */
+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_sse_ctx
+# define libc_fesetenvf_ctx		libc_fesetenv_sse_ctx
+# define libc_feupdateenvf_ctx		libc_feupdateenv_sse_ctx
+# define libc_feholdsetroundf_ctx	libc_feholdsetround_sse_ctx
+# define libc_feresetroundf_ctx		libc_feresetround_sse_ctx
+#else /* !__SSE_MATH__: 387 helpers; no libc_fesetenvf_ctx is defined in this branch.  */
+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_387_ctx
+# define libc_feupdateenvf_ctx		libc_feupdateenv_387_ctx
+# define libc_feholdsetroundf_ctx	libc_feholdsetround_387_ctx
+# define libc_feresetroundf_ctx		libc_feresetround_387_ctx
+#endif /* __SSE_MATH__ */
+
+#ifdef __SSE2_MATH__ /* Unsuffixed _ctx names: SSE helpers when __SSE2_MATH__ is defined.  */
+# define libc_feholdexcept_setround_ctx	libc_feholdexcept_setround_sse_ctx
+# define libc_fesetenv_ctx		libc_fesetenv_sse_ctx
+# define libc_feupdateenv_ctx		libc_feupdateenv_sse_ctx
+# define libc_feholdsetround_ctx	libc_feholdsetround_sse_ctx
+# define libc_feresetround_ctx		libc_feresetround_sse_ctx
+#else /* !__SSE2_MATH__: 387 helpers; no libc_fesetenv_ctx is defined in this branch.  */
+# define libc_feholdexcept_setround_ctx	libc_feholdexcept_setround_387_ctx
+# define libc_feupdateenv_ctx		libc_feupdateenv_387_ctx
+# define libc_feholdsetround_ctx	libc_feholdsetround_387_ctx
+# define libc_feresetround_ctx		libc_feresetround_387_ctx
+#endif /* __SSE2_MATH__ */
+/* The l-suffixed _ctx names always map to the 387 helpers.  */
+#define libc_feholdexcept_setroundl_ctx	libc_feholdexcept_setround_387_ctx
+#define libc_feupdateenvl_ctx		libc_feupdateenv_387_ctx
+#define libc_feholdsetroundl_ctx	libc_feholdsetround_387_ctx
+#define libc_feresetroundl_ctx		libc_feresetround_387_ctx
+/* The 53bit _ctx names are only defined when __SSE2_MATH__ is not.  */
+#ifndef __SSE2_MATH__
+# define libc_feholdsetround_53bit_ctx	libc_feholdsetround_387_53bit_ctx
+# define libc_feresetround_53bit_ctx	libc_feresetround_387_ctx
+#endif /* !__SSE2_MATH__ */
+
+#undef __mxcsr /* Drop the __mxcsr -> __eip alias if it was defined above.  */
+
+#endif /* FENV_PRIVATE_H */
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetenv.c b/REORG.TODO/sysdeps/i386/fpu/fesetenv.c
new file mode 100644
index 0000000000..a338e5d555
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetenv.c
@@ -0,0 +1,131 @@
+/* Install given floating-point environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <assert.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+
+/* All exceptions, including the x86-specific "denormal operand"
+   exception.  */
+#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM)
+
+
+int
+__fesetenv (const fenv_t *envp)
+{
+  fenv_t env;
+
+  /* fnstenv and fldenv operate on a memory image of 28 bytes.  */
+  assert (sizeof (fenv_t) == 28);
+
+  /* Start from the live x87 environment and overwrite only the pieces
+     that are meant to come from ENVP (or from one of the two
+     predefined environments); everything else keeps its current
+     value.  */
+  __asm__ ("fnstenv %0" : "=m" (*&env));
+
+  if (envp == FE_DFL_ENV)
+    {
+      /* Masks all set, FE_TOWARDZERO bits cleared, extended precision.  */
+      env.__control_word = (((env.__control_word | FE_ALL_EXCEPT_X86)
+                             & ~FE_TOWARDZERO) | _FPU_EXTENDED);
+      env.__status_word &= ~FE_ALL_EXCEPT_X86;
+    }
+  else if (envp == FE_NOMASK_ENV)
+    {
+      /* Standard exceptions unmasked; "denormal operand" stays masked.  */
+      env.__control_word = ((env.__control_word
+                             & ~(FE_ALL_EXCEPT | FE_TOWARDZERO))
+                            | __FE_DENORM | _FPU_EXTENDED);
+      env.__status_word &= ~FE_ALL_EXCEPT_X86;
+    }
+  else
+    {
+      /* Control-word fields copied from *ENVP: the exception masks,
+         the rounding control and the precision control.  */
+      const int taken = FE_ALL_EXCEPT_X86 | FE_TOWARDZERO | _FPU_EXTENDED;
+      env.__control_word = ((env.__control_word & ~taken)
+                            | (envp->__control_word & taken));
+      /* From the status word only the exception flags are copied.  */
+      env.__status_word = ((env.__status_word & ~FE_ALL_EXCEPT_X86)
+                           | (envp->__status_word
+                              & FE_ALL_EXCEPT_X86));
+    }
+  /* The pointer, selector and opcode fields are not taken from
+     anywhere; they are simply zeroed.  */
+  env.__eip = env.__data_offset = 0;
+  env.__cs_selector = env.__data_selector = 0;
+  env.__opcode = 0;
+
+  __asm__ ("fldenv %0" : : "m" (env));
+
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int csr;
+      __asm__ ("stmxcsr %0" : "=m" (csr));
+
+      if (envp == FE_DFL_ENV)
+        {
+          /* Clear the SSE exception flags and mask every exception.  */
+          csr = ((csr & ~FE_ALL_EXCEPT_X86)
+                 | (FE_ALL_EXCEPT_X86 << 7));
+
+          /* Rounding field (0x6000) set to FE_TONEAREST.  */
+          csr = ((csr & ~0x6000)
+                 | (FE_TONEAREST << 3));
+          /* FZ and DAZ bits off.  */
+          csr &= ~0x8040;
+        }
+      else if (envp == FE_NOMASK_ENV)
+        {
+          /* Clear the SSE exception flags and unmask the standard
+             exceptions ...  */
+          csr &= ~(FE_ALL_EXCEPT_X86 | (FE_ALL_EXCEPT << 7));
+          /* ... but keep the "denormal operand" exception masked.  */
+          csr |= __FE_DENORM << 7;
+
+          /* Rounding field (0x6000) set to FE_TONEAREST.  */
+          csr = ((csr & ~0x6000)
+                 | (FE_TONEAREST << 3));
+          /* FZ and DAZ bits off.  */
+          csr &= ~0x8040;
+        }
+      else
+        csr = envp->__eip; /* The MXCSR image is kept in the __eip slot.  */
+
+      __asm__ ("ldmxcsr %0" : : "m" (csr));
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetenv, __old_fesetenv)
+compat_symbol (libm, __old_fesetenv, fesetenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__fesetenv)
+libm_hidden_ver (__fesetenv, fesetenv)
+versioned_symbol (libm, __fesetenv, fesetenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c b/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c
new file mode 100644
index 0000000000..adfcf17ba6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c
@@ -0,0 +1,31 @@
+/* Set given exception flags.  i386 version.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+
+int
+fesetexcept (int excepts)
+{
+  fenv_t env; /* Scratch image of the x87 environment.  */
+
+  __asm__ ("fnstenv %0" : "=m" (*&env));
+  env.__status_word = env.__status_word | (FE_ALL_EXCEPT & excepts);
+  __asm__ ("fldenv %0" : : "m" (*&env));
+
+  return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetmode.c b/REORG.TODO/sysdeps/i386/fpu/fesetmode.c
new file mode 100644
index 0000000000..bd9f74cd97
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetmode.c
@@ -0,0 +1,54 @@
+/* Install given floating-point control modes.  i386 version.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+/* All exceptions, including the x86-specific "denormal operand"
+   exception.  */
+#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM)
+
+int
+fesetmode (const femode_t *modep)
+{
+  /* FE_DFL_MODE is only compared by address and never dereferenced;
+     it selects the default control word and default MXCSR mode bits.  */
+  const int use_default = modep == FE_DFL_MODE;
+  fpu_control_t ctrl = use_default ? _FPU_DEFAULT : modep->__control_word;
+
+  _FPU_SETCW (ctrl);
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int csr, mode_bits;
+      __asm__ ("stmxcsr %0" : "=m" (csr));
+      /* Everything except the exception flags is a mode bit.  The
+         default mode has only the exception-mask bits set.  */
+      if (use_default)
+        mode_bits = FE_ALL_EXCEPT_X86 << 7;
+      else
+        mode_bits = modep->__mxcsr & ~FE_ALL_EXCEPT_X86;
+      /* The live SSE exception flags survive; the rest is replaced.  */
+      csr = (csr & FE_ALL_EXCEPT_X86) | mode_bits;
+      __asm__ ("ldmxcsr %0" : : "m" (csr));
+    }
+
+  return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetround.c b/REORG.TODO/sysdeps/i386/fpu/fesetround.c
new file mode 100644
index 0000000000..a3fa6235c0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetround.c
@@ -0,0 +1,54 @@
+/* Set current rounding direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fesetround (int round)
+{
+  unsigned short int ctrl;
+
+  /* Anything with bits outside the 0xc00 mask is no rounding mode.  */
+  if ((round & ~0xc00) != 0)
+    return 1;
+
+  __asm__ ("fnstcw %0" : "=m" (*&ctrl));
+  ctrl = (ctrl & ~0xc00) | round;
+  /* The x87 control word now carries ROUND under mask 0xc00.  */
+  __asm__ ("fldcw %0" : : "m" (*&ctrl));
+
+  /* Mirror the mode into the MXCSR when the CPU has SSE.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int csr;
+
+      __asm__ ("stmxcsr %0" : "=m" (*&csr));
+      /* The MXCSR field sits three bits higher (mask 0x6000).  */
+      csr = (round << 3) | (csr & ~0x6000);
+      __asm__ ("ldmxcsr %0" : : "m" (*&csr));
+    }
+
+  return 0;
+}
+libm_hidden_def (__fesetround)
+weak_alias (__fesetround, fesetround)
+libm_hidden_weak (fesetround)
diff --git a/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c b/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c
new file mode 100644
index 0000000000..b610289cd0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c
@@ -0,0 +1,60 @@
+/* Install given floating-point environment and raise exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+#include <ldsodefs.h>
+
+int
+__feupdateenv (const fenv_t *envp)
+{
+  fexcept_t x87_status;
+  unsigned int sse_status = 0;
+
+  /* Snapshot the x87 status word before the environment changes.  */
+  __asm__ ("fnstsw %0" : "=m" (*&x87_status));
+
+  /* The MXCSR contributes too when the CPU has SSE; otherwise 0.  */
+  if (HAS_CPU_FEATURE (SSE))
+    __asm__ ("stmxcsr %0" : "=m" (*&sse_status));
+
+  const int raised = (x87_status | sse_status) & FE_ALL_EXCEPT;
+
+  /* Switch to the environment designated by ENVP.  */
+  __fesetenv (envp);
+
+  /* Re-raise what was pending.  Incidentally, the implementation
+     defined fexcept_t representation coincides with the FE_*
+     constants, so the masked status bits can be passed directly.  */
+  __feraiseexcept (raised);
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feupdateenv, __old_feupdateenv)
+compat_symbol (libm, __old_feupdateenv, feupdateenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__feupdateenv)
+libm_hidden_ver (__feupdateenv, feupdateenv)
+versioned_symbol (libm, __feupdateenv, feupdateenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c b/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c
new file mode 100644
index 0000000000..954e5f69d8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c
@@ -0,0 +1,57 @@
+/* Store current representation for exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+
+int
+__fegetexceptflag (fexcept_t *flagp, int excepts)
+{
+  fexcept_t x87_sw;
+  unsigned int pending;
+
+  /* Exceptions currently flagged in the x87 status word.  */
+  __asm__ ("fnstsw %0" : "=m" (*&x87_sw));
+  pending = x87_sw;
+
+  /* With SSE the flags recorded in the MXCSR count as well; the
+     register is only read here, never modified.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int sse_csr;
+
+      __asm__ ("stmxcsr %0" : "=m" (*&sse_csr));
+      pending |= sse_csr;
+    }
+
+  /* Hand back only the requested, standard exception bits.  */
+  *flagp = pending & excepts & FE_ALL_EXCEPT;
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetexceptflag, __old_fegetexceptflag)
+compat_symbol (libm, __old_fegetexceptflag, fegetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fegetexceptflag, fegetexceptflag, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c b/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c
new file mode 100644
index 0000000000..913d7b912c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c
@@ -0,0 +1,124 @@
+/* Raise given exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <math.h>
+
+int
+__feraiseexcept (int excepts)
+{
+  /* Raise exceptions represented by EXCEPTS.  But we must raise only
+     one signal at a time.  It is important that if the overflow/underflow
+     exception and the inexact exception are given at the same time,
+     the overflow/underflow exception precedes the inexact exception.  */
+
+  /* First: invalid exception.  */
+  if ((FE_INVALID & excepts) != 0)
+    {
+      /* One example of an invalid operation is 0.0 / 0.0.  */
+      double d;
+      __asm__ __volatile__ ("fldz; fdiv %%st, %%st(0); fwait" : "=t" (d));
+      (void) &d;
+    }
+
+  /* Next: division by zero.  */
+  if ((FE_DIVBYZERO & excepts) != 0)
+    {
+      double d;
+      __asm__ __volatile__ ("fldz; fld1; fdivp %%st, %%st(1); fwait"
+			    : "=t" (d));
+      (void) &d;
+    }
+
+  /* Next: overflow.  */
+  if ((FE_OVERFLOW & excepts) != 0)
+    {
+      /* There is no way to raise only the overflow flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* We have to set the selected exception flag.  Since there is no
+	 `fldsw' instruction we have to reload the whole environment.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_OVERFLOW;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* Next: underflow.  */
+  if ((FE_UNDERFLOW & excepts) != 0)
+    {
+      /* There is no way to raise only the underflow flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* We have to set the selected exception flag.  Since there is no
+	 `fldsw' instruction we have to reload the whole environment.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_UNDERFLOW;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* Last: inexact.  */
+  if ((FE_INEXACT & excepts) != 0)
+    {
+      /* There is no way to raise only the inexact flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* We have to set the selected exception flag.  Since there is no
+	 `fldsw' instruction we have to reload the whole environment.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_INEXACT;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* This implementation always reports success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feraiseexcept, __old_feraiseexcept)
+compat_symbol (libm, __old_feraiseexcept, feraiseexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__feraiseexcept)
+libm_hidden_ver (__feraiseexcept, feraiseexcept)
+versioned_symbol (libm, __feraiseexcept, feraiseexcept, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c b/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c
new file mode 100644
index 0000000000..efa64aaefd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c
@@ -0,0 +1,69 @@
+/* Set floating-point environment exception handling.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <math.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fesetexceptflag (const fexcept_t *flagp, int excepts)
+{
+  fenv_t temp;
+
+  /* Get the current environment.  We have to do this since we cannot
+     separately set the status word.  */
+  __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+  temp.__status_word &= ~(excepts & FE_ALL_EXCEPT);
+  temp.__status_word |= *flagp & excepts & FE_ALL_EXCEPT;
+
+  /* Store the new status word (along with the rest of the environment).
+     Possibly new exceptions are set but they won't be raised until
+     the next floating-point instruction.  */
+  __asm__ ("fldenv %0" : : "m" (*&temp));
+
+  /* If the CPU supports SSE, we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+      /* Replace the selected flag bits with those from *FLAGP.  */
+      xnew_exc &= ~(excepts & FE_ALL_EXCEPT);
+      xnew_exc |= *flagp & excepts & FE_ALL_EXCEPT;
+
+      /* Put the new data in effect.  */
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+
+  /* This implementation always reports success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetexceptflag, __old_fesetexceptflag)
+compat_symbol (libm, __old_fesetexceptflag, fesetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fesetexceptflag, fesetexceptflag, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c b/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c
new file mode 100644
index 0000000000..f523f9e709
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c
@@ -0,0 +1,40 @@
+/* Test exception in current environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+#include <ldsodefs.h>
+
+int
+fetestexcept (int excepts)
+{
+  short temp;
+  int xtemp = 0;
+
+  /* Get the current x87 exceptions; fnstsw stores the status word in %ax.  */
+  __asm__ ("fnstsw %0" : "=a" (temp));
+
+  /* If the CPU supports SSE we test the MXCSR as well; else XTEMP is 0.  */
+  if (HAS_CPU_FEATURE (SSE))
+    __asm__ ("stmxcsr %0" : "=m" (*&xtemp));
+
+  return (temp | xtemp) & excepts & FE_ALL_EXCEPT;
+}
+libm_hidden_def (fetestexcept)
diff --git a/REORG.TODO/sysdeps/i386/fpu/halfulp.c b/REORG.TODO/sysdeps/i386/fpu/halfulp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/halfulp.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h b/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h
new file mode 100644
index 0000000000..6ffc8e6f64
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h
@@ -0,0 +1,340 @@
+/* Helper macros for x86 libm functions.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _I386_MATH_ASM_H
+#define _I386_MATH_ASM_H 1
+
+/* Remove excess range and precision from %st(0) by storing it to a
+   temporary float (FLT_) or double (DBL_) stack slot and loading it back.  */
+#define FLT_NARROW_EVAL				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);
+
+/* Define read-only constants holding the minimum positive normal value
+   of a floating-point type, spelled out as little-endian bytes.  */
+#define DEFINE_FLT_MIN				\
+	.section .rodata.cst4,"aM",@progbits,4;	\
+	.p2align 2;				\
+	.type flt_min,@object;			\
+flt_min:					\
+	.byte 0, 0, 0x80, 0;			\
+	.size flt_min, .-flt_min;
+#define DEFINE_DBL_MIN				\
+	.section .rodata.cst8,"aM",@progbits,8;	\
+	.p2align 3;				\
+	.type dbl_min,@object;			\
+dbl_min:					\
+	.byte 0, 0, 0, 0, 0, 0, 0x10, 0;	\
+	.size dbl_min, .-dbl_min;
+#define DEFINE_LDBL_MIN					\
+	.section .rodata.cst16,"aM",@progbits,16;	\
+	.p2align 4;					\
+	.type ldbl_min,@object;				\
+ldbl_min:						\
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x1, 0;	\
+	.byte 0, 0, 0, 0, 0, 0;				\
+	.size ldbl_min, .-ldbl_min;
+
+/* Remove excess range and precision from %st(0) by storing it on the
+   stack and loading it back.  The value is given to be nonnegative or
+   NaN; if it is subnormal, also force an underflow exception by squaring
+   a copy of it and storing the square in the narrower type.  The relevant
+   minimum constant (flt_min or dbl_min) and the MO macro for memory
+   operands must have been defined and, if PIC, the PIC register loaded.  */
+#define FLT_NARROW_EVAL_UFLOW_NONNEG_NAN	\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	flds	MO(flt_min);			\
+	fld	%st(1);				\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+6424:	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNEG_NAN	\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fldl	MO(dbl_min);			\
+	fld	%st(1);				\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+6453:	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);
+
+/* Likewise, but the argument is nonnegative and not a NaN (so fcom
+   instructions, which support memory operands, can be used).  */
+#define FLT_NARROW_EVAL_UFLOW_NONNEG		\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fcoms	MO(flt_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+6424:	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNEG		\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fcoml	MO(dbl_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+6453:	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);
+
+/* Likewise, but the non-NaN argument may be negative (|x| is compared).  */
+#define FLT_NARROW_EVAL_UFLOW_NONNAN		\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fabs;					\
+	fcomps	MO(flt_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+6424:	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNAN		\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fabs;					\
+	fcompl	MO(dbl_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+6453:	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);
+
+/* Force an underflow exception if the value in %st(0) is subnormal,
+   leaving %st(0) itself unchanged.  The relevant minimum constant and
+   the MO macro for memory operands must have been defined and, if
+   PIC, the PIC register must have been loaded.  */
+#define FLT_CHECK_FORCE_UFLOW			\
+	flds	MO(flt_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);		\
+6424:
+#define DBL_CHECK_FORCE_UFLOW			\
+	fldl	MO(dbl_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);		\
+6453:
+
+/* Likewise, but also remove excess range and precision from %st(0)
+   when the value is subnormal (other values are left as they are).  */
+#define FLT_CHECK_FORCE_UFLOW_NARROW		\
+	flds	MO(flt_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+	fstps	(%esp);				\
+	flds	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);		\
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NARROW		\
+	fldl	MO(dbl_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+	fstpl	(%esp);				\
+	fldl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);		\
+6453:
+
+/* Likewise for long double (no narrowing); argument nonnegative or NaN.  */
+#define LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN	\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fucompp;				\
+	fnstsw;					\
+	sahf;					\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+/* Force underflow for subnormal values; the argument is not a NaN.  */
+#define FLT_CHECK_FORCE_UFLOW_NONNAN		\
+	fld %st(0);				\
+	fabs;					\
+	fcomps	MO(flt_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);		\
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NONNAN		\
+	fld %st(0);				\
+	fabs;					\
+	fcompl	MO(dbl_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);		\
+6453:
+#define LDBL_CHECK_FORCE_UFLOW_NONNAN		\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fcompp;					\
+	fnstsw;					\
+	sahf;					\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+/* Force underflow if subnormal; argument is nonnegative and not a NaN.  */
+#define FLT_CHECK_FORCE_UFLOW_NONNEG		\
+	fcoms	MO(flt_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6424f;				\
+	subl	$4, %esp;			\
+	cfi_adjust_cfa_offset (4);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstps	(%esp);				\
+	addl	$4, %esp;			\
+	cfi_adjust_cfa_offset (-4);		\
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NONNEG		\
+	fcoml	MO(dbl_min);			\
+	fnstsw;					\
+	sahf;					\
+	jnc 6453f;				\
+	subl	$8, %esp;			\
+	cfi_adjust_cfa_offset (8);		\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstpl	(%esp);				\
+	addl	$8, %esp;			\
+	cfi_adjust_cfa_offset (-8);		\
+6453:
+#define LDBL_CHECK_FORCE_UFLOW_NONNEG		\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fcompp;					\
+	fnstsw;					\
+	sahf;					\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+#endif /* i386-math-asm.h.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps
new file mode 100644
index 0000000000..0fc50907ad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps
@@ -0,0 +1,2202 @@
+# Begin of automatic generation
+
+# Maximal error of functions:
+Function: "acos":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "acos_downward":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_towardzero":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "acosh":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 2
+
+Function: "acosh_downward":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_towardzero":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 3
+
+Function: "asin":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "asin_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asinh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "asinh_downward":
+double: 1
+float: 1
+idouble: 1
+ildouble: 5
+ldouble: 5
+
+Function: "asinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "asinh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "atan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "atanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "atanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 3
+
+Function: "atanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "cabs":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "cacos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "cacos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacosh_downward":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cacosh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacosh_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "carg":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "casin_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "casin_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "casin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casin_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casinh_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Imaginary part of "casinh_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "casinh_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "casinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "catan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "catanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catanh":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cbrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "cbrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccosh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cexp":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cexp":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cexp_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cexp_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "clog":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog10":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog10":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "clog10_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "clog10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cos_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh":
+double: 1
+float: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: "cosh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: Real part of "cpow":
+double: 2
+float: 5
+idouble: 2
+ifloat: 5
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cpow":
+float: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "cpow_downward":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cpow_towardzero":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cpow_upward":
+double: 4
+float: 1
+idouble: 4
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cpow_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csin":
+float: 1
+ifloat: 1
+
+Function: Real part of "csin_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csinh":
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "csinh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csinh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csqrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ctan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_towardzero":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctan_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ctanh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctanh_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctanh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "erf":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erfc":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "erfc_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "exp":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_downward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_upward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "expm1":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "expm1_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "gamma":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "gamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_towardzero":
+double: 4
+float: 2
+idouble: 4
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "hypot":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "j0_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j0_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "j0_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j1":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j1_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "j1_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: "jn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "jn_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "lgamma":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "lgamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_towardzero":
+double: 4
+float: 2
+idouble: 4
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "log":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log1p":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log1p_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "log2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow_downward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_towardzero":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "sin":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "sin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "sincos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sincos_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sinh":
+double: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sinh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "sinh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "sinh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "tan":
+float: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "tan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "tanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 7
+ldouble: 4
+
+Function: "tanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 4
+
+Function: "tgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "y0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "y0_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "y1":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "y1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "y1_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y1_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "yn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "yn_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "yn_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "yn_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+# end of automatic generation
diff --git a/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name
new file mode 100644
index 0000000000..54ca0d8295
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name
@@ -0,0 +1 @@
+ix86
diff --git a/REORG.TODO/sysdeps/i386/fpu/math-tests.h b/REORG.TODO/sysdeps/i386/fpu/math-tests.h
new file mode 100644
index 0000000000..26d0633dc0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/math-tests.h
@@ -0,0 +1,27 @@
+/* Configuration for math tests.  32-bit x86 version.
+   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* On 32-bit x86, versions of GCC up to at least 4.8 are happy to use FPU load
+   instructions for sNaN values, and loading a float or double sNaN value will
+   already raise an INVALID exception as well as turn the sNaN into a qNaN,
+   rendering certain tests infeasible in this scenario.
+   <http://gcc.gnu.org/PR56831>.  */
+#define SNAN_TESTS_float	0
+#define SNAN_TESTS_double	0
+
+#include_next <math-tests.h>
diff --git a/REORG.TODO/sysdeps/i386/fpu/math_private.h b/REORG.TODO/sysdeps/i386/fpu/math_private.h
new file mode 100644
index 0000000000..485214391f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/math_private.h
@@ -0,0 +1,7 @@
+#ifndef I386_MATH_PRIVATE_H
+#define I386_MATH_PRIVATE_H 1
+
+#include "fenv_private.h"
+#include_next <math_private.h>
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpatan.c b/REORG.TODO/sysdeps/i386/fpu/mpatan.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpatan.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpatan2.c b/REORG.TODO/sysdeps/i386/fpu/mpatan2.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpatan2.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpexp.c b/REORG.TODO/sysdeps/i386/fpu/mpexp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpexp.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mplog.c b/REORG.TODO/sysdeps/i386/fpu/mplog.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mplog.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c b/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinh.S b/REORG.TODO/sysdeps/i386/fpu/s_asinh.S
new file mode 100644
index 0000000000..1a60f7de2c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinh.S
@@ -0,0 +1,139 @@
+/* ix87 specific implementation of arcsinh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type huge,@object
+huge:	.double 1e+300
+	ASM_SIZE_DIRECTIVE(huge)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__asinh)
+	movl	8(%esp), %ecx		// high 32 bits of the double argument x
+	movl	$0x7fffffff, %eax
+	andl	%ecx, %eax		// %eax = high word of |x| (sign bit masked off)
+	andl	$0x80000000, %ecx	// %ecx = sign bit of x only; tested by jecxz below
+	movl	%eax, %edx
+	orl	$0x800fffff, %edx	// set every bit outside the exponent field
+	incl	%edx			// wraps to 0 only if the exponent field is all ones
+	jz	7f			// x in ±Inf or NaN
+	xorl	%ecx, 8(%esp)		// clear the sign bit of the argument in place
+	fldl	4(%esp)			// |x|
+	cmpl	$0x3e300000, %eax	// 0x3e300000 = high word of 2^-28
+	jb	2f			// |x| < 2^-28
+	fldln2				// log(2) : |x|
+	cmpl	$0x41b00000, %eax	// 0x41b00000 = high word of 2^28
+	fxch				// |x| : log(2)
+	ja	3f			// |x| > 2^28
+#ifdef	PIC
+	LOAD_PIC_REG (dx)		// presumably loads the base that MO() adds to %edx -- macro defined elsewhere
+#endif
+	cmpl	$0x40000000, %eax	// 0x40000000 = high word of 2.0
+	ja	5f			// |x| > 2
+
+	// 2^-28 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+	fld	%st			// |x| : |x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : log(2)
+	fld	%st			// |x|^2 : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x|^2 : |x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	fdivrp				// |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+	faddp				// |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+	fcoml	MO(limit)		// compare the log1p argument with 0.29
+	fnstsw				// FPU status word -> %ax (%eax is not read again on this path)
+	sahf				// copy the compare result into the CPU flags
+	ja	6f			// argument > limit: use fyl2x on 1+argument instead
+	fyl2xp1				// log(2)*log2(1+argument) = log1p(argument)
+	jecxz	4f			// x was not negative: result already has the right sign
+	fchs
+4:	ret
+
+7:	fldl	4(%esp)			// Inf or NaN: return the argument itself
+	ret
+
+6:	faddl	MO(one)			// 1+argument : log(2)
+	fyl2x				// log(2)*log2(1+argument) = log(1+argument)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| < 2^-28 => y = x (inexact iff |x| != 0.0)
+	.align ALIGNARG(4)
+2:
+#ifdef	PIC
+	LOAD_PIC_REG (dx)		// presumably loads the base that MO() adds to %edx -- macro defined elsewhere
+#endif
+	jecxz	4f			// x was not negative: keep |x| as is
+	fchs				// x
+4:	fld	%st			// x : x
+	faddl	MO(huge)		// huge+x : x
+	fstp	%st(0)			// x
+	cmpl	$0x00100000, %eax	// high word below 0x00100000 <=> exponent field is zero
+	jae	8f			// normal number: done
+	subl	$8, %esp		// scratch slot for one double
+	cfi_adjust_cfa_offset (8)
+	fld	%st(0)			// x : x
+	fmul	%st(0)			// x*x : x
+	fstpl	(%esp)			// x  (x*x rounded to double and discarded; presumably to raise underflow)
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+8:	ret
+
+	// |x| > 2^28 => y = sign(x) * (log(|x|) + log(2))
+	.align ALIGNARG(4)
+3:	fyl2x				// log(|x|)
+	fldln2				// log(2) : log(|x|)
+	faddp				// log(|x|)+log(2)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+	.align ALIGNARG(4)
+5:	fld	%st			// |x| : |x| : log(2)
+	fadd	%st, %st(1)		// |x| : 2*|x| : log(2)
+	fld	%st			// |x| : |x| : 2*|x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : 2*|x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x| : 2*|x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+	faddp				// |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+	fdivrl	MO(one)			// 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+	faddp				// 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+	fyl2x				// log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+	jecxz	4f
+	fchs
+4:	ret
+END(__asinh)
+weak_alias (__asinh, asinh)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S b/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S
new file mode 100644
index 0000000000..12bcfef934
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S
@@ -0,0 +1,139 @@
+/* ix87 specific implementation of arcsinh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type huge,@object
+huge:	.double 1e+36
+	ASM_SIZE_DIRECTIVE(huge)
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__asinhf)
+	movl	4(%esp), %ecx		// bit pattern of the float argument x
+	movl	$0x7fffffff, %eax
+	andl	%ecx, %eax		// %eax = bit pattern of |x| (sign bit masked off)
+	andl	$0x80000000, %ecx	// %ecx = sign bit of x only; tested by jecxz below
+	movl	%eax, %edx
+	orl	$0x807fffff, %edx	// set every bit outside the exponent field
+	incl	%edx			// wraps to 0 only if the exponent field is all ones
+	jz	7f			// x in ±Inf or NaN
+	xorl	%ecx, 4(%esp)		// clear the sign bit of the argument in place
+	flds	4(%esp)			// |x|
+	cmpl	$0x38000000, %eax	// 0x38000000 = 2^-15 as a float
+	jb	2f			// |x| < 2^-15
+	fldln2				// log(2) : |x|
+	cmpl	$0x47000000, %eax	// 0x47000000 = 2^15 as a float
+	fxch				// |x| : log(2)
+	ja	3f			// |x| > 2^15
+#ifdef	PIC
+	LOAD_PIC_REG (dx)		// presumably loads the base that MO() adds to %edx -- macro defined elsewhere
+#endif
+	cmpl	$0x40000000, %eax	// 0x40000000 = 2.0 as a float
+	ja	5f			// |x| > 2
+
+	// 2^-15 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+	fld	%st			// |x| : |x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : log(2)
+	fld	%st			// |x|^2 : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x|^2 : |x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	fdivrp				// |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+	faddp				// |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+	fcoml	MO(limit)		// compare the log1p argument with 0.29
+	fnstsw				// FPU status word -> %ax (%eax is not read again on this path)
+	sahf				// copy the compare result into the CPU flags
+	ja	6f			// argument > limit: use fyl2x on 1+argument instead
+	fyl2xp1				// log(2)*log2(1+argument) = log1p(argument)
+	jecxz	4f			// x was not negative: result already has the right sign
+	fchs
+4:	ret
+
+7:	flds	4(%esp)			// Inf or NaN: return the argument itself
+	ret
+
+6:	faddl	MO(one)			// 1+argument : log(2)
+	fyl2x				// log(2)*log2(1+argument) = log(1+argument)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| < 2^-15 => y = x (inexact iff |x| != 0.0)
+	.align ALIGNARG(4)
+2:
+#ifdef	PIC
+	LOAD_PIC_REG (dx)		// presumably loads the base that MO() adds to %edx -- macro defined elsewhere
+#endif
+	jecxz	4f			// x was not negative: keep |x| as is
+	fchs				// x
+4:	fld	%st			// x : x
+	faddl	MO(huge)		// huge+x : x
+	fstp	%st(0)			// x
+	cmpl	$0x00800000, %eax	// below 0x00800000 <=> exponent field is zero
+	jae	8f			// normal number: done
+	subl	$4, %esp		// scratch slot for one float
+	cfi_adjust_cfa_offset (4)
+	fld	%st(0)			// x : x
+	fmul	%st(0)			// x*x : x
+	fstps	(%esp)			// x  (x*x rounded to float and discarded; presumably to raise underflow)
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+8:	ret
+
+	// |x| > 2^15 => y = sign(x) * (log(|x|) + log(2))
+	.align ALIGNARG(4)
+3:	fyl2x				// log(|x|)
+	fldln2				// log(2) : log(|x|)
+	faddp				// log(|x|)+log(2)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+	.align ALIGNARG(4)
+5:	fld	%st			// |x| : |x| : log(2)
+	fadd	%st, %st(1)		// |x| : 2*|x| : log(2)
+	fld	%st			// |x| : |x| : 2*|x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : 2*|x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x| : 2*|x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+	faddp				// |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+	fdivrl	MO(one)			// 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+	faddp				// 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+	fyl2x				// log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+	jecxz	4f
+	fchs
+4:	ret
+END(__asinhf)
+weak_alias (__asinhf, asinhf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S b/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S
new file mode 100644
index 0000000000..f31a267e78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S
@@ -0,0 +1,144 @@
+/* ix87 specific implementation of arcsinh.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type huge,@object
+huge:	.tfloat 1e+4930
+	ASM_SIZE_DIRECTIVE(huge)
+	.align ALIGNARG(4)
+	/* Please note that we use double value for 1.0.  This number
+	   has an exact representation and so we don't get accuracy
+	   problems.  The advantage is that the code is simpler.  */
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__asinhl)
+	movl	12(%esp), %ecx		// sign/exponent word of x in the low 16 bits
+	movl	$0x7fff, %eax
+	andl	%ecx, %eax		// %eax = 15-bit biased exponent of x
+	andl	$0x8000, %ecx		// %ecx = sign bit of x only; tested by jecxz below
+	movl	%eax, %edx
+	orl	$0xffff8000, %edx	// set every bit outside the exponent field
+	incl	%edx			// wraps to 0 only if the exponent field is all ones
+	jz	7f			// x in ±Inf or NaN
+	xorl	%ecx, 12(%esp)		// clear the sign bit of the argument in place
+	fldt	4(%esp)			// |x|
+	cmpl	$0x3fde, %eax		// 0x3fde = biased exponent of 2^-33 (bias 0x3fff)
+	jb	2f			// |x| < 2^-33
+	fldln2				// log(2) : |x|
+	cmpl	$0x4020, %eax		// 0x4020 = biased exponent of 2^33
+	fxch				// |x| : log(2)
+	ja	3f			// |x| >= 2^34
+#ifdef	PIC
+	LOAD_PIC_REG (dx)		// presumably loads the base that MO() adds to %edx -- macro defined elsewhere
+#endif
+	cmpl	$0x4000, %eax		// 0x4000 = biased exponent of 2.0
+	ja	5f			// |x| >= 4
+
+	// 2^-33 <= |x| < 4 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+	fld	%st			// |x| : |x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : log(2)
+	fld	%st			// |x|^2 : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x|^2 : |x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	faddl	MO(one)			// 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+	fdivrp				// |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+	faddp				// |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+	fcoml	MO(limit)		// compare the log1p argument with 0.29
+	fnstsw				// FPU status word -> %ax (%eax is not read again on this path)
+	sahf				// copy the compare result into the CPU flags
+	ja	6f			// argument > limit: use fyl2x on 1+argument instead
+	fyl2xp1				// log(2)*log2(1+argument) = log1p(argument)
+	jecxz	4f			// x was not negative: result already has the right sign
+	fchs
+4:	ret
+
+7:	fldt	4(%esp)			// Inf or NaN
+	fadd	%st			// x+x (Inf stays Inf; presumably quiets a signaling NaN)
+	ret
+
+6:	faddl	MO(one)			// 1+argument : log(2)
+	fyl2x				// log(2)*log2(1+argument) = log(1+argument)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// |x| < 2^-33 => y = x (inexact iff |x| != 0.0)
+	.align ALIGNARG(4)
+2:
+#ifdef	PIC
+	LOAD_PIC_REG (dx)		// presumably loads the base that MO() adds to %edx -- macro defined elsewhere
+#endif
+	jecxz	4f			// x was not negative: keep |x| as is
+	fchs				// x
+4:	fld	%st			// x : x
+	fldt	MO(huge)		// huge : x : x
+	faddp				// huge+x : x
+	fstp	%st(0)			// x
+	cmpl	$0x0001, %eax		// biased exponent 0 <=> exponent field is zero
+	jae	8f			// exponent field non-zero: done
+	fld	%st(0)			// x : x
+	fmul	%st(0)			// x*x : x
+	fstp	%st(0)			// x  (x*x discarded; presumably computed only to raise underflow)
+8:	ret
+
+	// |x| >= 2^34 => y = sign(x) * (log(|x|) + log(2))
+	.align ALIGNARG(4)
+3:	fyl2x				// log(|x|)
+	fldln2				// log(2) : log(|x|)
+	faddp				// log(|x|)+log(2)
+	jecxz	4f
+	fchs
+4:	ret
+
+	// 4 <= |x| < 2^34 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+	.align ALIGNARG(4)
+5:	fld	%st			// |x| : |x| : log(2)
+	fadd	%st, %st(1)		// |x| : 2*|x| : log(2)
+	fld	%st			// |x| : |x| : 2*|x| : log(2)
+	fmul	%st(1)			// |x|^2 : |x| : 2*|x| : log(2)
+	faddl	MO(one)			// 1+|x|^2 : |x| : 2*|x| : log(2)
+	fsqrt				// sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+	faddp				// |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+	fdivrl	MO(one)			// 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+	faddp				// 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+	fyl2x				// log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+	jecxz	4f
+	fchs
+4:	ret
+END(__asinhl)
+weak_alias (__asinhl, asinhl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atan.S b/REORG.TODO/sysdeps/i386/fpu/s_atan.S
new file mode 100644
index 0000000000..644de78feb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atan.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_atan.S,v 1.4 1995/05/08 23:50:41 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__atan)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)		// presumably loads the base that MO() adds to %ecx -- macro defined elsewhere
+#endif
+	fldl	4(%esp)			// x
+	fld1				// 1.0 : x
+	fpatan				// atan(x/1.0) = atan(x)
+	DBL_CHECK_FORCE_UFLOW		// macro defined outside this file; by its name, forces underflow for tiny results -- confirm
+	ret				// result is left in %st(0)
+END (__atan)
+weak_alias (__atan, atan)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atanf.S b/REORG.TODO/sysdeps/i386/fpu/s_atanf.S
new file mode 100644
index 0000000000..0589c1135e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atanf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_atanf.S,v 1.3 1995/05/08 23:51:33 jtc Exp $")
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__atanf)
+#ifdef  PIC
+	LOAD_PIC_REG (cx)		// presumably loads the base that MO() adds to %ecx -- macro defined elsewhere
+#endif
+	flds	4(%esp)			// x
+	fld1				// 1.0 : x
+	fpatan				// atan(x/1.0) = atan(x)
+	FLT_CHECK_FORCE_UFLOW		// macro defined outside this file; by its name, forces underflow for tiny results -- confirm
+	ret				// result is left in %st(0)
+END (__atanf)
+weak_alias (__atanf, atanf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atanl.c b/REORG.TODO/sysdeps/i386/fpu/s_atanl.c
new file mode 100644
index 0000000000..b7dba88aad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atanl.c
@@ -0,0 +1,22 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__atanl (long double x)	/* Return the arc tangent of X using the x87 fpatan instruction.  */
+{
+  long double res;		/* Receives the value left on top of the x87 stack.  */
+
+  asm ("fld1\n"			/* Push 1.0; X, already in st(0), moves to st(1).  */
+       "fpatan"			/* st(1) = atan (st(1) / st(0)), then pop: atan (X / 1.0).  */
+       : "=t" (res) : "0" (x));	/* "t" is the top of the x87 stack; "0" places X in that same operand.  */
+
+  return res;
+}
+
+weak_alias (__atanl, atanl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S
new file mode 100644
index 0000000000..7f01659eae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S
@@ -0,0 +1,200 @@
+/* Compute the cube root of a double value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+   Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+        .section .rodata	/* Read-only constants for __cbrt: f1..f7 are the coefficients of the polynomial u = f1 + f2*xm + ... + f7*xm^6 that the function evaluates in Horner form.  */
+
+        .align ALIGNARG(4)
+        .type f7,@object
+f7:	.double -0.145263899385486377	/* leading coefficient (xm^6) */
+	ASM_SIZE_DIRECTIVE(f7)
+        .type f6,@object
+f6:	.double 0.784932344976639262	/* xm^5 */
+	ASM_SIZE_DIRECTIVE(f6)
+        .type f5,@object
+f5:	.double -1.83469277483613086	/* xm^4 */
+	ASM_SIZE_DIRECTIVE(f5)
+        .type f4,@object
+f4:	.double 2.44693122563534430	/* xm^3 */
+	ASM_SIZE_DIRECTIVE(f4)
+        .type f3,@object
+f3:	.double -2.11499494167371287	/* xm^2 */
+	ASM_SIZE_DIRECTIVE(f3)
+        .type f2,@object
+f2:	.double 1.50819193781584896	/* xm^1 */
+	ASM_SIZE_DIRECTIVE(f2)
+        .type f1,@object
+f1:	.double 0.354895765043919860	/* constant term */
+	ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2		1.2599210498948731648	/* 2^(1/3) */
+#define ONE_CBRT2	0.793700525984099737355196796584	/* 2^(-1/3) */
+#define SQR_CBRT2	1.5874010519681994748	/* 2^(2/3) */
+#define ONE_SQR_CBRT2	0.629960524947436582364439673883	/* 2^(-2/3) */
+
+	.type factor,@object	/* factor[k] = 2^((k-2)/3); __cbrt reads it at byte offset 16 + 8*(xe%3) */
+factor:	.double ONE_SQR_CBRT2	/* xe%3 == -2 */
+	.double ONE_CBRT2	/* xe%3 == -1 */
+	.double 1.0	/* xe%3 == 0 */
+	.double CBRT2	/* xe%3 == 1 */
+	.double SQR_CBRT2	/* xe%3 == 2 */
+	ASM_SIZE_DIRECTIVE(factor)
+
+        .type two54,@object
+two54:  .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43	/* little-endian bytes of the double 0x4350000000000000 = 2^54, used to scale subnormal inputs */
+        ASM_SIZE_DIRECTIVE(two54)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)	/* PIC: operand addressed via @GOTOFF off %ebx */
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)	/* same, additionally indexed by register x (byte offset) */
+#else
+#define MO(op) op	/* non-PIC: absolute symbol */
+#define MOX(op,x) op(x)	/* absolute symbol indexed by register x (byte offset) */
+#endif
+
+	.text
+ENTRY(__cbrt)	/* double __cbrt (double x): cube root of x; +-0, Inf and NaN are returned as loaded */
+	movl	4(%esp), %ecx	/* low 32 bits of x */
+	movl	8(%esp), %eax	/* high 32 bits of x: sign, exponent, top of mantissa */
+	movl	%eax, %edx	/* keep a copy that still has the sign bit */
+	andl	$0x7fffffff, %eax	/* drop the sign bit */
+	orl	%eax, %ecx	/* zero only when every non-sign bit of x is zero */
+	jz	1f	/* x is +-0: return the argument */
+	xorl	%ecx, %ecx	/* %ecx accumulates the binary exponent xe; no pre-scaling yet */
+	cmpl	$0x7ff00000, %eax
+	jae	1f	/* exponent field all ones (Inf or NaN): return the argument */
+
+#ifdef PIC
+	pushl	%ebx	/* from here on the argument is 4 bytes further up the stack */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	LOAD_PIC_REG (bx)
+#endif
+
+	cmpl	$0x00100000, %eax
+	jae	2f	/* exponent field non-zero: normal number, no scaling needed */
+
+#ifdef PIC
+	fldl	8(%esp)	/* subnormal x */
+#else
+	fldl	4(%esp)	/* subnormal x */
+#endif
+	fmull	MO(two54)	/* x * 2^54 */
+	movl	$-54, %ecx	/* compensate for the scaling in the exponent */
+#ifdef PIC
+	fstpl	8(%esp)	/* overwrite the argument slot with the scaled value */
+	movl	12(%esp), %eax	/* reload its high word */
+#else
+	fstpl	4(%esp)	/* overwrite the argument slot with the scaled value */
+	movl	8(%esp), %eax	/* reload its high word */
+#endif
+	movl	%eax, %edx	/* copy with sign, as at function entry */
+	andl	$0x7fffffff, %eax	/* drop the sign bit */
+
+2:	shrl	$20, %eax	/* biased exponent field */
+	andl	$0x800fffff, %edx	/* keep sign and mantissa bits of the high word */
+	subl	$1022, %eax	/* exponent relative to a mantissa in [0.5, 1) */
+	orl	$0x3fe00000, %edx	/* force exponent field 0x3fe, i.e. |xm| in [0.5, 1) */
+	addl	%eax, %ecx	/* xe = exponent (plus -54 on the subnormal path) */
+#ifdef PIC
+	movl	%edx, 12(%esp)	/* patch the argument slot so that it holds xm (sign kept) */
+
+	fldl	8(%esp)			/* xm */
+#else
+	movl	%edx, 8(%esp)	/* patch the argument slot so that it holds xm (sign kept) */
+
+	fldl	4(%esp)			/* xm */
+#endif
+	fabs	/* |xm|; the sign is reapplied just before returning */
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The right track computes the value for b) and this is done
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Very easy: efficiency.  Some FP
+	   instructions can overlap with a certain amount of integer (and
+	   FP) instructions.  So we get (except for the imull) all
+	   instructions for free.  */
+
+	fld	%st(0)			/* xm : xm */
+
+	fmull	MO(f7)			/* f7*xm : xm */
+			movl	$1431655766, %eax	/* 0x55555556 = ceil(2^32/3) */
+	faddl	MO(f6)			/* f6+f7*xm : xm */
+			imull	%ecx	/* %edx:%eax = xe * 0x55555556; %edx = floor(xe/3) for exponents in this range */
+	fmul	%st(1)			/* (f6+f7*xm)*xm : xm */
+			movl	%ecx, %eax
+	faddl	MO(f5)			/* f5+(f6+f7*xm)*xm : xm */
+			sarl	$31, %eax	/* -1 if xe is negative, else 0 */
+	fmul	%st(1)			/* (f5+(f6+f7*xm)*xm)*xm : xm */
+			subl	%eax, %edx	/* add 1 for negative xe: %edx = xe/3 rounded toward zero */
+	faddl	MO(f4)			/* f4+(f5+(f6+f7*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */
+	faddl	MO(f3)			/* f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */
+	faddl	MO(f2)			/* f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */
+	faddl	MO(f1)			/* u:=f1+(f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */
+
+	fld	%st			/* u : u : xm */
+	fmul	%st(1)			/* u*u : u : xm */
+	fld	%st(2)			/* xm : u*u : u : xm */
+	fadd	%st			/* 2*xm : u*u : u : xm */
+	fxch	%st(1)			/* u*u : 2*xm : u : xm */
+	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
+			movl	%edx, %eax	/* save xe/3 for the final fscale */
+	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
+			leal	(%edx,%edx,2),%edx	/* 3*(xe/3) */
+	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
+			subl	%edx, %ecx	/* xe%3, in -2..2 with the sign of xe */
+	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
+			shll	$3, %ecx	/* times 8: byte offset of a double within factor */
+	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
+	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
+	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
+	pushl	%eax	/* spill xe/3 to memory so that fildl can load it */
+	cfi_adjust_cfa_offset (4)
+	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+	popl	%edx	/* drop the spilled integer; the value itself is unused */
+	cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+	movl	12(%esp), %eax	/* high word of the patched argument; its sign bit is still that of x */
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+#else
+	movl	8(%esp), %eax	/* high word of the patched argument; its sign bit is still that of x */
+#endif
+	testl	%eax, %eax	/* sets SF from the sign bit of x */
+	fstp	%st(1)	/* pop xe/3 from under the result (does not touch EFLAGS) */
+	jns	4f	/* x positive: done */
+	fchs	/* x negative: negate the root */
+4:	ret
+
+	/* Return the argument.  */
+1:	fldl	4(%esp)	/* reached before any push, so the argument is still at 4(%esp) */
+	ret
+END(__cbrt)
+weak_alias (__cbrt, cbrt)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S
new file mode 100644
index 0000000000..645d24372d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S
@@ -0,0 +1,177 @@
+/* Compute the cube root of a float value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+   Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+        .section .rodata	/* Read-only constants for __cbrtf: the function evaluates u = f1 + (f2 - f3*xm)*xm.  */
+
+        .align ALIGNARG(4)
+        .type f3,@object
+f3:	.double 0.191502161678719066	/* subtracted: enters as f2 - f3*xm */
+        ASM_SIZE_DIRECTIVE(f3)
+        .type f2,@object
+f2:	.double 0.697570460207922770	/* xm^1 coefficient */
+        ASM_SIZE_DIRECTIVE(f2)
+        .type f1,@object
+f1:	.double 0.492659620528969547	/* constant term */
+        ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2		1.2599210498948731648	/* 2^(1/3) */
+#define ONE_CBRT2	0.793700525984099737355196796584	/* 2^(-1/3) */
+#define SQR_CBRT2	1.5874010519681994748	/* 2^(2/3) */
+#define ONE_SQR_CBRT2	0.629960524947436582364439673883	/* 2^(-2/3) */
+
+	.type factor,@object	/* factor[k] = 2^((k-2)/3), stored as doubles; __cbrtf reads it at byte offset 16 + 8*(xe%3) */
+        .align ALIGNARG(4)
+factor:	.double ONE_SQR_CBRT2	/* xe%3 == -2 */
+	.double ONE_CBRT2	/* xe%3 == -1 */
+	.double 1.0	/* xe%3 == 0 */
+	.double CBRT2	/* xe%3 == 1 */
+	.double SQR_CBRT2	/* xe%3 == 2 */
+	ASM_SIZE_DIRECTIVE(factor)
+
+        .type two25,@object
+two25:	.byte 0, 0, 0, 0x4c	/* little-endian bytes of the float 0x4c000000 = 2^25, used to scale subnormal inputs */
+        ASM_SIZE_DIRECTIVE(two25)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)	/* PIC: operand addressed via @GOTOFF off %ebx */
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)	/* same, additionally indexed by register x (byte offset) */
+#else
+#define MO(op) op	/* non-PIC: absolute symbol */
+#define MOX(op,x) op(x)	/* absolute symbol indexed by register x (byte offset) */
+#endif
+
+	.text
+ENTRY(__cbrtf)	/* float __cbrtf (float x): cube root of x; +-0, Inf and NaN are returned as loaded */
+	movl	4(%esp), %eax	/* bit pattern of x */
+	xorl	%ecx, %ecx	/* %ecx accumulates the binary exponent xe; no pre-scaling yet */
+	movl	%eax, %edx	/* keep a copy that still has the sign bit */
+	andl	$0x7fffffff, %eax	/* drop the sign bit */
+	jz	1f	/* x is +-0: return the argument */
+	cmpl	$0x7f800000, %eax
+	jae	1f	/* exponent field all ones (Inf or NaN): return the argument */
+
+#ifdef PIC
+	pushl	%ebx	/* from here on the argument is 4 bytes further up the stack */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	LOAD_PIC_REG (bx)
+#endif
+
+	cmpl	$0x00800000, %eax
+	jae	2f	/* exponent field non-zero: normal number, no scaling needed */
+
+#ifdef PIC
+	flds	8(%esp)	/* subnormal x */
+#else
+	flds	4(%esp)	/* subnormal x */
+#endif
+	fmuls	MO(two25)	/* x * 2^25 */
+	movl	$-25, %ecx	/* compensate for the scaling in the exponent */
+#ifdef PIC
+	fstps	8(%esp)	/* overwrite the argument slot with the scaled value */
+	movl	8(%esp), %eax	/* reload its bit pattern */
+#else
+	fstps	4(%esp)	/* overwrite the argument slot with the scaled value */
+	movl	4(%esp), %eax	/* reload its bit pattern */
+#endif
+	movl	%eax, %edx	/* copy with sign, as at function entry */
+	andl	$0x7fffffff, %eax	/* drop the sign bit */
+
+2:	shrl	$23, %eax	/* biased exponent field */
+	andl	$0x807fffff, %edx	/* keep sign and mantissa bits */
+	subl	$126, %eax	/* exponent relative to a mantissa in [0.5, 1) */
+	orl	$0x3f000000, %edx	/* force exponent field 126, i.e. |xm| in [0.5, 1) */
+	addl	%eax, %ecx	/* xe = exponent (plus -25 on the subnormal path) */
+#ifdef PIC
+	movl	%edx, 8(%esp)	/* patch the argument slot so that it holds xm (sign kept) */
+
+	flds	8(%esp)			/* xm */
+#else
+	movl	%edx, 4(%esp)	/* patch the argument slot so that it holds xm (sign kept) */
+
+	flds	4(%esp)			/* xm */
+#endif
+	fabs	/* |xm|; the sign is reapplied just before returning */
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The right track computes the value for b) and this is done
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Very easy: efficiency.  Some FP
+	   instructions can overlap with a certain amount of integer (and
+	   FP) instructions.  So we get (except for the imull) all
+	   instructions for free.  */
+
+	fld	%st(0)			/* xm : xm */
+	fmull	MO(f3)			/* f3*xm : xm */
+			movl	$1431655766, %eax	/* 0x55555556 = ceil(2^32/3) */
+	fsubrl	MO(f2)			/* f2-f3*xm : xm */
+			imull	%ecx	/* %edx:%eax = xe * 0x55555556; %edx = floor(xe/3) for exponents in this range */
+	fmul	%st(1)			/* (f2-f3*xm)*xm : xm */
+			movl	%ecx, %eax
+	faddl	MO(f1)			/* u:=f1+(f2-f3*xm)*xm : xm */
+			sarl	$31, %eax	/* -1 if xe is negative, else 0 */
+	fld	%st			/* u : u : xm */
+			subl	%eax, %edx	/* add 1 for negative xe: %edx = xe/3 rounded toward zero */
+	fmul	%st(1)			/* u*u : u : xm */
+	fld	%st(2)			/* xm : u*u : u : xm */
+	fadd	%st			/* 2*xm : u*u : u : xm */
+	fxch	%st(1)			/* u*u : 2*xm : u : xm */
+	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
+			movl	%edx, %eax	/* save xe/3 for the final fscale */
+	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
+			leal	(%edx,%edx,2),%edx	/* 3*(xe/3) */
+	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
+			subl	%edx, %ecx	/* xe%3, in -2..2 with the sign of xe */
+	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
+			shll	$3, %ecx	/* times 8: byte offset of a double within factor */
+	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
+	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
+	fmull	MOX(16+factor,%ecx)	/* u*(t2+2*xm)/(2*t2+xm)*FACT */
+	pushl	%eax	/* spill xe/3 to memory so that fildl can load it */
+	cfi_adjust_cfa_offset (4)
+	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+	popl	%edx	/* drop the spilled integer; the value itself is unused */
+	cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+	movl	8(%esp), %eax	/* patched argument; its sign bit is still that of x */
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+#else
+	movl	4(%esp), %eax	/* patched argument; its sign bit is still that of x */
+#endif
+	testl	%eax, %eax	/* sets SF from the sign bit of x */
+	fstp	%st(1)	/* pop xe/3 from under the result (does not touch EFLAGS) */
+	jns	4f	/* x positive: done */
+	fchs	/* x negative: negate the root */
+4:	ret
+
+	/* Return the argument.  */
+1:	flds	4(%esp)	/* reached before any push, so the argument is still at 4(%esp) */
+	ret
+END(__cbrtf)
+weak_alias (__cbrtf, cbrtf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S
new file mode 100644
index 0000000000..e4a72d29c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S
@@ -0,0 +1,229 @@
+/* Compute the cube root of a long double value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+   Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+        .section .rodata	/* Read-only constants for __cbrtl: f1..f8 are the coefficients of the polynomial u = f1 + f2*xm + ... + f8*xm^7 that the function evaluates in Horner form.  */
+
+        .align ALIGNARG(4)
+        .type f8,@object
+f8:	.tfloat 0.161617097923756032	/* leading coefficient (xm^7) */
+	ASM_SIZE_DIRECTIVE(f8)
+        .align ALIGNARG(4)
+        .type f7,@object
+f7:	.tfloat -0.988553671195413709	/* xm^6 */
+	ASM_SIZE_DIRECTIVE(f7)
+        .align ALIGNARG(4)
+        .type f6,@object
+f6:	.tfloat 2.65298938441952296	/* xm^5 */
+	ASM_SIZE_DIRECTIVE(f6)
+        .align ALIGNARG(4)
+        .type f5,@object
+f5:	.tfloat -4.11151425200350531	/* xm^4 */
+	ASM_SIZE_DIRECTIVE(f5)
+        .align ALIGNARG(4)
+        .type f4,@object
+f4:	.tfloat 4.09559907378707839	/* xm^3 */
+	ASM_SIZE_DIRECTIVE(f4)
+        .align ALIGNARG(4)
+        .type f3,@object
+f3:	.tfloat -2.82414939754975962	/* xm^2 */
+	ASM_SIZE_DIRECTIVE(f3)
+        .align ALIGNARG(4)
+        .type f2,@object
+f2:	.tfloat 1.67595307700780102	/* xm^1 */
+	ASM_SIZE_DIRECTIVE(f2)
+        .align ALIGNARG(4)
+        .type f1,@object
+f1:	.tfloat 0.338058687610520237	/* constant term */
+	ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2		1.2599210498948731648	/* 2^(1/3) */
+#define ONE_CBRT2	0.793700525984099737355196796584	/* 2^(-1/3) */
+#define SQR_CBRT2	1.5874010519681994748	/* 2^(2/3) */
+#define ONE_SQR_CBRT2	0.629960524947436582364439673883	/* 2^(-2/3) */
+
+	/* We make the entries in the following table all 16 bytes
+	   wide to avoid having to implement a multiplication by 10.  */
+	.type factor,@object	/* factor[k] = 2^((k-2)/3); __cbrtl reads it at byte offset 32 + 16*(xe%3) */
+        .align ALIGNARG(4)
+factor:	.tfloat ONE_SQR_CBRT2	/* xe%3 == -2 */
+	.byte 0, 0, 0, 0, 0, 0	/* pad the 10-byte value to 16 bytes */
+	.tfloat ONE_CBRT2	/* xe%3 == -1 */
+	.byte 0, 0, 0, 0, 0, 0	/* padding */
+	.tfloat 1.0	/* xe%3 == 0 */
+	.byte 0, 0, 0, 0, 0, 0	/* padding */
+	.tfloat CBRT2	/* xe%3 == 1 */
+	.byte 0, 0, 0, 0, 0, 0	/* padding */
+	.tfloat SQR_CBRT2	/* xe%3 == 2; last entry, so no padding follows */
+	ASM_SIZE_DIRECTIVE(factor)
+
+        .type two64,@object
+        .align ALIGNARG(4)
+two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43	/* little-endian bytes of the double 0x43f0000000000000 = 2^64 (read with fmull), used to scale inputs whose exponent field is zero */
+        ASM_SIZE_DIRECTIVE(two64)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)	/* PIC: operand addressed via @GOTOFF off %ebx */
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)	/* same, additionally indexed by register x (byte offset) */
+#else
+#define MO(op) op	/* non-PIC: absolute symbol */
+#define MOX(op,x) op(x)	/* absolute symbol indexed by register x (byte offset) */
+#endif
+
+	.text
+ENTRY(__cbrtl)	/* long double __cbrtl (long double x): cube root of x; for +-0, Inf and NaN the result is x + x */
+	movl	4(%esp), %ecx	/* low 32 bits of the mantissa */
+	movl	12(%esp), %eax	/* sign and 15-bit exponent in the low 16 bits; the upper 16 bits are masked off below */
+	orl	8(%esp), %ecx	/* OR in the high 32 bits of the mantissa */
+	movl	%eax, %edx	/* keep a copy that still has the sign bit */
+	andl	$0x7fff, %eax	/* exponent field only */
+	orl	%eax, %ecx	/* zero only when mantissa and exponent are all zero */
+	jz	1f	/* x is +-0 */
+	xorl	%ecx, %ecx	/* %ecx accumulates the binary exponent xe; no pre-scaling yet */
+	cmpl	$0x7fff, %eax
+	je	1f	/* exponent field all ones (Inf or NaN) */
+
+#ifdef PIC
+	pushl	%ebx	/* from here on the argument is 4 bytes further up the stack */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	LOAD_PIC_REG (bx)
+#endif
+
+	cmpl	$0, %eax
+	jne	2f	/* exponent field non-zero: no scaling needed */
+
+#ifdef PIC
+	fldt	8(%esp)	/* x with a zero exponent field */
+#else
+	fldt	4(%esp)	/* x with a zero exponent field */
+#endif
+	fmull	MO(two64)	/* x * 2^64 */
+	movl	$-64, %ecx	/* compensate for the scaling in the exponent */
+#ifdef PIC
+	fstpt	8(%esp)	/* overwrite the argument slot with the scaled value */
+	movl	16(%esp), %eax	/* reload its sign/exponent word */
+#else
+	fstpt	4(%esp)	/* overwrite the argument slot with the scaled value */
+	movl	12(%esp), %eax	/* reload its sign/exponent word */
+#endif
+	movl	%eax, %edx	/* copy with sign, as at function entry */
+	andl	$0x7fff, %eax	/* exponent field only */
+
+2:	andl	$0x8000, %edx	/* keep only the sign bit */
+	subl	$16382, %eax	/* exponent relative to a mantissa in [0.5, 1) */
+	orl	$0x3ffe, %edx	/* sign plus exponent field 0x3ffe, i.e. |xm| in [0.5, 1) */
+	addl	%eax, %ecx	/* xe = exponent (plus -64 on the scaled path) */
+#ifdef PIC
+	movl	%edx, 16(%esp)	/* patch the sign/exponent word so that the slot holds xm */
+
+	fldt	8(%esp)			/* xm */
+#else
+	movl	%edx, 12(%esp)	/* patch the sign/exponent word so that the slot holds xm */
+
+	fldt	4(%esp)			/* xm */
+#endif
+	fabs	/* |xm|; the sign is reapplied just before returning */
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The right track computes the value for b) and this is done
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Very easy: efficiency.  Some FP
+	   instructions can overlap with a certain amount of integer (and
+	   FP) instructions.  So we get (except for the imull) all
+	   instructions for free.  */
+
+	fldt	MO(f8)			/* f8 : xm */
+	fmul	%st(1)			/* f8*xm : xm */
+
+	fldt	MO(f7)	/* f7 : f8*xm : xm */
+	faddp				/* f7+f8*xm : xm */
+	fmul	%st(1)			/* (f7+f8*xm)*xm : xm */
+			movl	$1431655766, %eax	/* 0x55555556 = ceil(2^32/3) */
+	fldt	MO(f6)	/* f6 pushed on top of the running sum */
+	faddp				/* f6+(f7+f8*xm)*xm : xm */
+			imull	%ecx	/* %edx:%eax = xe * 0x55555556; %edx = floor(xe/3) for exponents in this range */
+	fmul	%st(1)			/* (f6+(f7+f8*xm)*xm)*xm : xm */
+			movl	%ecx, %eax
+	fldt	MO(f5)	/* f5 pushed on top of the running sum */
+	faddp				/* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
+			sarl	$31, %eax	/* -1 if xe is negative, else 0 */
+	fmul	%st(1)			/* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
+			subl	%eax, %edx	/* add 1 for negative xe: %edx = xe/3 rounded toward zero */
+	fldt	MO(f4)	/* f4 pushed on top of the running sum */
+	faddp				/* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
+	fldt	MO(f3)	/* f3 pushed on top of the running sum */
+	faddp				/* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
+	fldt	MO(f2)	/* f2 pushed on top of the running sum */
+	faddp				/* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
+	fmul	%st(1)			/* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
+	fldt	MO(f1)	/* f1 pushed on top of the running sum */
+	faddp				/* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
+
+	fld	%st			/* u : u : xm */
+	fmul	%st(1)			/* u*u : u : xm */
+	fld	%st(2)			/* xm : u*u : u : xm */
+	fadd	%st			/* 2*xm : u*u : u : xm */
+	fxch	%st(1)			/* u*u : 2*xm : u : xm */
+	fmul	%st(2)			/* t2:=u*u*u : 2*xm : u : xm */
+			movl	%edx, %eax	/* save xe/3 for the final fscale */
+	fadd	%st, %st(1)		/* t2 : t2+2*xm : u : xm */
+			leal	(%edx,%edx,2),%edx	/* 3*(xe/3) */
+	fadd	%st(0)			/* 2*t2 : t2+2*xm : u : xm */
+			subl	%edx, %ecx	/* xe%3, in -2..2 with the sign of xe */
+	faddp	%st, %st(3)		/* t2+2*xm : u : 2*t2+xm */
+			shll	$4, %ecx	/* times 16: byte offset of a padded entry within factor */
+	fmulp				/* u*(t2+2*xm) : 2*t2+xm */
+	fdivp	%st, %st(1)		/* u*(t2+2*xm)/(2*t2+xm) */
+	fldt	MOX(32+factor,%ecx)	/* FACT : u*(t2+2*xm)/(2*t2+xm) */
+	fmulp				/* u*(t2+2*xm)/(2*t2+xm)*FACT */
+	pushl	%eax	/* spill xe/3 to memory so that fildl can load it */
+	cfi_adjust_cfa_offset (4)
+	fildl	(%esp)			/* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+	fxch				/* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+	fscale				/* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+	popl	%edx	/* drop the spilled integer; the value itself is unused */
+	cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+	movl	16(%esp), %eax	/* patched sign/exponent word; bit 15 is still the sign of x */
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+#else
+	movl	12(%esp), %eax	/* patched sign/exponent word; bit 15 is still the sign of x */
+#endif
+	testl	$0x8000, %eax	/* ZF clear when x is negative */
+	fstp	%st(1)	/* pop xe/3 from under the result (does not touch EFLAGS) */
+	jz	4f	/* x positive: done */
+	fchs	/* x negative: negate the root */
+4:	ret
+
+	/* Return the argument.  */
+1:	fldt	4(%esp)	/* reached before any push, so the argument is still at 4(%esp) */
+	fadd	%st	/* x + x: +-0 and +-Inf are unchanged; NOTE(review): presumably done so that a signaling NaN raises invalid, which a bare fldt would not -- confirm */
+	ret
+END(__cbrtl)
+weak_alias (__cbrtl, cbrtl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceil.S b/REORG.TODO/sysdeps/i386/fpu/s_ceil.S
new file mode 100644
index 0000000000..1226bb2f87
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceil.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ceil.S,v 1.4 1995/05/08 23:52:13 jtc Exp $")
+
+ENTRY(__ceil)	/* double __ceil (double x): round x to an integral value toward +infinity */
+	fldl	4(%esp)	/* x */
+	subl	$32,%esp	/* scratch area: modified control word at (%esp), saved FPU environment from 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x0800,%edx		/* round towards +oo */
+	orl	4(%esp),%edx	/* merge with the saved control word (first field of the environment) */
+	andl	$0xfbff,%edx	/* clear bit 10 so that rounding control (bits 11:10) is 10b; also zeroes the upper 16 bits */
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp	/* release the scratch area */
+	cfi_adjust_cfa_offset (-32)
+	ret	/* result is returned in st(0) */
+END (__ceil)
+weak_alias (__ceil, ceil)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S b/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S
new file mode 100644
index 0000000000..d345c0973b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ceilf.S,v 1.3 1995/05/08 23:52:44 jtc Exp $")
+
+ENTRY(__ceilf)	/* float __ceilf (float x): round x to an integral value toward +infinity */
+	flds	4(%esp)	/* x */
+	subl	$32,%esp	/* scratch area: modified control word at (%esp), saved FPU environment from 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x0800,%edx		/* round towards +oo */
+	orl	4(%esp),%edx	/* merge with the saved control word (first field of the environment) */
+	andl	$0xfbff,%edx	/* clear bit 10 so that rounding control (bits 11:10) is 10b; also zeroes the upper 16 bits */
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp	/* release the scratch area */
+	cfi_adjust_cfa_offset (-32)
+	ret	/* result is returned in st(0) */
+END (__ceilf)
+weak_alias (__ceilf, ceilf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceill.S b/REORG.TODO/sysdeps/i386/fpu/s_ceill.S
new file mode 100644
index 0000000000..7c08f43b24
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceill.S
@@ -0,0 +1,40 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__ceill)	/* long double __ceill (long double x): round x to an integral value toward +infinity */
+	fldt	4(%esp)	/* x */
+	subl	$32,%esp	/* scratch area: modified control word at (%esp), saved FPU environment from 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x0800,%edx		/* round towards +oo */
+	orl	4(%esp),%edx	/* merge with the saved control word (first field of the environment) */
+	andl	$0xfbff,%edx	/* clear bit 10 so that rounding control (bits 11:10) is 10b; also zeroes the upper 16 bits */
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	/* Preserve "invalid" exceptions from sNaN input.  */
+	fnstsw	/* current status word into %ax */
+	andl	$0x1, %eax	/* isolate bit 0, the invalid-operation flag */
+	orl	%eax, 8(%esp)	/* fold it into the saved status word (second field of the environment) */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp	/* release the scratch area */
+	cfi_adjust_cfa_offset (-32)
+	ret	/* result is returned in st(0) */
+END (__ceill)
+weak_alias (__ceill, ceill)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysign.S b/REORG.TODO/sysdeps/i386/fpu/s_copysign.S
new file mode 100644
index 0000000000..2520a94427
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysign.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_copysign.S,v 1.4 1995/05/08 23:53:02 jtc Exp $")
+
+ENTRY(__copysign)	/* double __copysign (double x, double y): x with its sign bit replaced by that of y */
+	movl	16(%esp),%edx	/* high word of y */
+	movl	8(%esp),%eax	/* high word of x */
+	andl	$0x80000000,%edx	/* sign bit of y */
+	andl	$0x7fffffff,%eax	/* x without its sign bit */
+	orl	%edx,%eax	/* combine */
+	movl	%eax,8(%esp)	/* write back into the argument slot of x */
+	fldl	4(%esp)	/* load the patched x as the result */
+	ret
+END (__copysign)
+weak_alias (__copysign, copysign)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S b/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S
new file mode 100644
index 0000000000..57b1a6f119
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_copysignf.S,v 1.3 1995/05/08 23:53:25 jtc Exp $")
+
+ENTRY(__copysignf)	/* float __copysignf (float x, float y): x with its sign bit replaced by that of y */
+	movl	8(%esp),%edx	/* bit pattern of y */
+	movl	4(%esp),%eax	/* bit pattern of x */
+	andl	$0x80000000,%edx	/* sign bit of y */
+	andl	$0x7fffffff,%eax	/* x without its sign bit */
+	orl	%edx,%eax	/* combine */
+	movl	%eax,4(%esp)	/* write back into the argument slot of x */
+	flds	4(%esp)	/* load the patched x as the result */
+	ret
+END (__copysignf)
+weak_alias (__copysignf, copysignf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S b/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S
new file mode 100644
index 0000000000..2163e7b014
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S
@@ -0,0 +1,21 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__copysignl)	/* long double __copysignl (long double x, long double y): x with its sign bit replaced by that of y */
+	movl	24(%esp),%edx	/* sign/exponent word of y (y starts at 16(%esp)) */
+	movl	12(%esp),%eax	/* sign/exponent word of x (x starts at 4(%esp)) */
+	andl	$0x8000,%edx	/* sign bit of y */
+	andl	$0x7fff,%eax	/* exponent of x without the sign bit */
+	orl	%edx,%eax	/* combine */
+	movl	%eax,12(%esp)	/* write back into the argument slot of x */
+	fldt	4(%esp)	/* load the patched x as the result */
+	ret
+END (__copysignl)
+weak_alias (__copysignl, copysignl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1.S
new file mode 100644
index 0000000000..59fded2d5a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1.S
@@ -0,0 +1,113 @@
+/* ix87 specific implementation of exp(x)-1.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+   Based on code by John C. Bowman <bowman@ipp-garching.mpg.de>.
+   Corrections by H.J. Lu (hjl@gnu.ai.mit.edu), 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+	/* Using: e^x - 1 = 2^(x * log2(e)) - 1 */
+
+#include <sysdep.h>
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata	// Read-only constants for __expm1.
+
+	.align ALIGNARG(4)
+	.type minus1,@object
+minus1:	.double -1.0	// result returned for very negative x
+	ASM_SIZE_DIRECTIVE(minus1)
+	.type one,@object
+one:	.double 1.0	// used to form 2^int(...) and 1-2^int(...)
+	ASM_SIZE_DIRECTIVE(one)
+	.type l2e,@object
+l2e:	.tfloat 1.442695040888963407359924681002	// log2(e), extended precision
+	ASM_SIZE_DIRECTIVE(l2e)
+
+DEFINE_DBL_MIN	// NOTE(review): macro from i386-math-asm.h (not shown here); presumably emits the DBL_MIN constant read by DBL_CHECK_FORCE_UFLOW -- confirm
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)	/* PIC: operand addressed via @GOTOFF off %edx */
+#else
+#define MO(op) op	/* non-PIC: absolute symbol */
+#endif
+
+	.text
+ENTRY(__expm1)	// double __expm1 (double x): exp(x)-1
+	movzwl	4+6(%esp), %eax	// top 16 bits of x: sign, exponent, 4 mantissa bits
+	xorb	$0x80, %ah	// invert sign bit (now 1 is "positive")
+	cmpl	$0xc086, %eax	// is num >= 704?
+	jae	HIDDEN_JUMPTARGET (__exp)	// yes: tail-jump to __exp, which sees the same stack argument
+
+	fldl	4(%esp)		// x
+	fxam			// Is NaN, +-Inf or +-0?
+	xorb	$0x80, %ah	// restore the original sign bit
+	cmpl	$0xc043, %eax	// is num <= -38.0?
+	fstsw	%ax	// fxam condition codes into %ah; EFLAGS from the cmpl are untouched
+	movb	$0x45, %ch	// mask for C3, C2, C0 as they appear in %ah
+	jb	4f	// branch on the cmpl above: not below -38.0
+
+	// Below -38.0 (may be -NaN or -Inf).
+	andb	%ah, %ch	// keep only C3, C2, C0
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpb	$0x01, %ch	// C0 alone set: fxam classified x as NaN
+	je	5f		// If -NaN, jump.
+	jmp	2f		// -large, possibly -Inf.
+
+4:	// In range -38.0 to 704.0 (may be +-0 but not NaN or +-Inf).
+	andb	%ah, %ch	// keep only C3, C2, C0
+	cmpb	$0x40, %ch	// C3 alone set: fxam classified x as zero
+	je	3f		// If +-0, jump.
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+5:	fldt	MO(l2e)		// log2(e) : x
+	fmulp			// log2(e)*x
+	fld	%st		// log2(e)*x : log2(e)*x
+	// Set round-to-nearest temporarily.
+	subl	$8, %esp	// scratch: modified control word at (%esp), original at 4(%esp)
+	cfi_adjust_cfa_offset (8)
+	fstcw	4(%esp)	// save the current control word
+	movl	$0xf3ff, %ecx
+	andl	4(%esp), %ecx	// clear rounding-control bits 11:10 (round to nearest)
+	movl	%ecx, (%esp)
+	fldcw	(%esp)
+	frndint			// int(log2(e)*x) : log2(e)*x
+	fldcw	4(%esp)	// restore the original control word
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fsubr	%st, %st(1)	// int(log2(e)*x) : fract(log2(e)*x)
+	fxch			// fract(log2(e)*x) : int(log2(e)*x)
+	f2xm1			// 2^fract(log2(e)*x)-1 : int(log2(e)*x)
+	fscale			// 2^(log2(e)*x)-2^int(log2(e)*x) : int(log2(e)*x)
+	fxch			// int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fldl	MO(one)		// 1 : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fscale			// 2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrl	MO(one)		// 1-2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fstp	%st(1)		// 1-2^int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrp	%st, %st(1)	// 2^(log2(e)*x)-1
+	DBL_CHECK_FORCE_UFLOW	// NOTE(review): defined in i386-math-asm.h; presumably raises underflow for tiny results -- confirm
+	ret
+
+2:	fstp	%st	// discard x
+	fldl	MO(minus1)	// Set result to -1.0.
+3:	ret	// on the +-0 path x itself is still in st(0) and is returned
+END(__expm1)
+weak_alias (__expm1, expm1)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S
new file mode 100644
index 0000000000..4f0b2e7832
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S
@@ -0,0 +1,113 @@
+/* ix87 specific implementation of exp(x)-1.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+   Based on code by John C. Bowman <bowman@ipp-garching.mpg.de>.
+   Corrections by H.J. Lu (hjl@gnu.ai.mit.edu), 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+	/* Using: e^x - 1 = 2^(x * log2(e)) - 1 */
+
+#include <sysdep.h>
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+	.section .rodata	// Read-only constants for __expm1f.
+
+	.align ALIGNARG(4)
+	.type minus1,@object
+minus1:	.double -1.0	// result returned for very negative x
+	ASM_SIZE_DIRECTIVE(minus1)
+	.type one,@object
+one:	.double 1.0	// used to form 2^int(...) and 1-2^int(...)
+	ASM_SIZE_DIRECTIVE(one)
+	.type l2e,@object
+l2e:	.tfloat 1.442695040888963407359924681002	// log2(e), extended precision
+	ASM_SIZE_DIRECTIVE(l2e)
+
+DEFINE_FLT_MIN	// NOTE(review): macro from i386-math-asm.h (not shown here); presumably emits the FLT_MIN constant read by FLT_CHECK_FORCE_UFLOW -- confirm
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)	/* PIC: operand addressed via @GOTOFF off %edx */
+#else
+#define MO(op) op	/* non-PIC: absolute symbol */
+#endif
+
+	.text
+ENTRY(__expm1f)	// float __expm1f (float x): exp(x)-1
+	movzwl	4+2(%esp), %eax	// top 16 bits of x: sign, exponent, 7 mantissa bits
+	xorb	$0x80, %ah	// invert sign bit (now 1 is "positive")
+	cmpl	$0xc2b1, %eax	// is num >= 88.5?
+	jae	HIDDEN_JUMPTARGET (__expf)	// yes: tail-jump to __expf, which sees the same stack argument
+
+	flds	4(%esp)		// x
+	fxam			// Is NaN, +-Inf or +-0?
+	xorb	$0x80, %ah	// restore the original sign bit
+	cmpl	$0xc190, %eax	// is num <= -18.0?
+	fstsw	%ax	// fxam condition codes into %ah; EFLAGS from the cmpl are untouched
+	movb	$0x45, %ch	// mask for C3, C2, C0 as they appear in %ah
+	jb	4f	// branch on the cmpl above: not below -18.0
+
+	// Below -18.0 (may be -NaN or -Inf).
+	andb	%ah, %ch	// keep only C3, C2, C0
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	cmpb	$0x01, %ch	// C0 alone set: fxam classified x as NaN
+	je	5f		// If -NaN, jump.
+	jmp	2f		// -large, possibly -Inf.
+
+4:	// In range -18.0 to 88.5 (may be +-0 but not NaN or +-Inf).
+	andb	%ah, %ch	// keep only C3, C2, C0
+	cmpb	$0x40, %ch	// C3 alone set: fxam classified x as zero
+	je	3f		// If +-0, jump.
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+5:	fldt	MO(l2e)		// log2(e) : x
+	fmulp			// log2(e)*x
+	fld	%st		// log2(e)*x : log2(e)*x
+	// Set round-to-nearest temporarily.
+	subl	$8, %esp	// scratch: modified control word at (%esp), original at 4(%esp)
+	cfi_adjust_cfa_offset (8)
+	fstcw	4(%esp)	// save the current control word
+	movl	$0xf3ff, %ecx
+	andl	4(%esp), %ecx	// clear rounding-control bits 11:10 (round to nearest)
+	movl	%ecx, (%esp)
+	fldcw	(%esp)
+	frndint			// int(log2(e)*x) : log2(e)*x
+	fldcw	4(%esp)	// restore the original control word
+	addl	$8, %esp
+	cfi_adjust_cfa_offset (-8)
+	fsubr	%st, %st(1)	// int(log2(e)*x) : fract(log2(e)*x)
+	fxch			// fract(log2(e)*x) : int(log2(e)*x)
+	f2xm1			// 2^fract(log2(e)*x)-1 : int(log2(e)*x)
+	fscale			// 2^(log2(e)*x)-2^int(log2(e)*x) : int(log2(e)*x)
+	fxch			// int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fldl	MO(one)		// 1 : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fscale			// 2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrl	MO(one)		// 1-2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fstp	%st(1)		// 1-2^int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrp	%st, %st(1)	// 2^(log2(e)*x)-1
+	FLT_CHECK_FORCE_UFLOW	// NOTE(review): defined in i386-math-asm.h; presumably raises underflow for tiny results -- confirm
+	ret
+
+2:	fstp	%st	// discard x
+	fldl	MO(minus1)	// Set result to -1.0.
+3:	ret	// on the +-0 path x itself is still in st(0) and is returned
+END(__expm1f)
+weak_alias (__expm1f, expm1f)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S
new file mode 100644
index 0000000000..7fbd99b0db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S
@@ -0,0 +1,2 @@
+#define USE_AS_EXPM1L	/* NOTE(review): presumably selects the expm1l variant inside e_expl.S (not shown here) -- confirm */
+#include <e_expl.S>	/* the whole implementation comes from this shared source */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabs.S b/REORG.TODO/sysdeps/i386/fpu/s_fabs.S
new file mode 100644
index 0000000000..23ae9dccb9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabs.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+	.text
+ENTRY(__fabs)	/* double __fabs (double x): absolute value of x */
+	fldl	4(%esp)	/* x */
+	fabs	/* clear the sign of st(0) */
+	ret	/* result is returned in st(0) */
+END(__fabs)
+weak_alias (__fabs, fabs)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S b/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S
new file mode 100644
index 0000000000..c0407a8839
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+	.text
+ENTRY(__fabsf)	/* float __fabsf (float x): absolute value of x */
+	flds	4(%esp)	/* x */
+	fabs	/* clear the sign of st(0) */
+	ret	/* result is returned in st(0) */
+END(__fabsf)
+weak_alias (__fabsf, fabsf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S b/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S
new file mode 100644
index 0000000000..a12a3e050b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+	.text
+ENTRY(__fabsl)	/* long double __fabsl (long double x): absolute value of x */
+	fldt	4(%esp)	/* x */
+	fabs	/* clear the sign of st(0) */
+	ret	/* result is returned in st(0) */
+END(__fabsl)
+weak_alias (__fabsl, fabsl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fdim.c b/REORG.TODO/sysdeps/i386/fpu/s_fdim.c
new file mode 100644
index 0000000000..6243c62998
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fdim.c
@@ -0,0 +1,50 @@
+/* Return positive difference between arguments.  i386 version.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <fpu_control.h>
+#include <math.h>
+#include <math_private.h>
+
+double
+__fdim (double x, double y)	/* Return 0.0 if X <= Y, otherwise X - Y rounded once to double.  */
+{
+  if (islessequal (x, y))	/* False when either operand is a NaN, so NaNs fall through to the subtraction.  */
+    return 0.0;
+
+  /* To avoid double rounding, set double precision for the
+     subtraction.  math_narrow_eval is still needed to eliminate
+     excess range in the case of overflow.  If the result of the
+     subtraction is in the subnormal range for double, it is exact, so
+     no issues of double rounding for subnormals arise.  */
+  fpu_control_t cw, cw_double;
+  _FPU_GETCW (cw);	/* Save the caller's control word.  */
+  cw_double = (cw & ~_FPU_EXTENDED) | _FPU_DOUBLE;	/* Same word with the precision field switched to double.  */
+  _FPU_SETCW (cw_double);
+  double r = math_narrow_eval (x - y);
+  _FPU_SETCW (cw);	/* Restore the caller's control word.  */
+  if (isinf (r) && !isinf (x) && !isinf (y))	/* Infinite result from finite operands: the subtraction overflowed.  */
+    __set_errno (ERANGE);
+
+  return r;
+}
+weak_alias (__fdim, fdim)
+#ifdef NO_LONG_DOUBLE
+strong_alias (__fdim, __fdiml)	/* With NO_LONG_DOUBLE the long double entry points alias the double version.  */
+weak_alias (__fdim, fdiml)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finite.S b/REORG.TODO/sysdeps/i386/fpu/s_finite.S
new file mode 100644
index 0000000000..1ae4aed451
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finite.S
@@ -0,0 +1,17 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finite)			/* Returns 1 in %eax if the double argument is finite, 0 for Inf/NaN.  */
+	movl	8(%esp),%eax		/* high 32 bits of x: sign, exponent, top of mantissa */
+	movl    $0xFFEFFFFF,%ecx
+	subl    %eax,%ecx		/* ecx = 0xFFEFFFFF - high word */
+	xorl    %ecx,%eax		/* bit 31 is now set iff the exponent field is not all ones */
+	shrl	$31, %eax		/* move that bit down to bit 0 */
+	ret
+END (__finite)
+weak_alias (__finite, finite)
+hidden_def (__finite)
+
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finitef.S b/REORG.TODO/sysdeps/i386/fpu/s_finitef.S
new file mode 100644
index 0000000000..69e72facff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finitef.S
@@ -0,0 +1,16 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finitef)		/* Returns 1 in %eax if the float argument is finite, 0 for Inf/NaN.  */
+	movl	4(%esp),%eax		/* raw 32-bit image of x */
+	movl    $0xFF7FFFFF,%ecx
+	subl    %eax,%ecx		/* ecx = 0xFF7FFFFF - x bits */
+	xorl    %ecx,%eax		/* bit 31 is now set iff the exponent field is not all ones */
+	shrl    $31,%eax		/* move that bit down to bit 0 */
+	ret
+END (__finitef)
+weak_alias (__finitef, finitef)
+hidden_def (__finitef)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finitel.S b/REORG.TODO/sysdeps/i386/fpu/s_finitel.S
new file mode 100644
index 0000000000..cce90e18fc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finitel.S
@@ -0,0 +1,15 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finitel)		/* Returns 1 in %eax if the long double argument's exponent is not 0x7fff, else 0.  */
+	movl	12(%esp),%eax		/* word holding sign and 15-bit exponent in its low 16 bits */
+	orl	$0xffff8000, %eax	/* set every bit except the exponent field */
+	incl	%eax			/* wraps to 0 only when the exponent was all ones */
+	shrl	$31, %eax		/* bit 31 survives (1) unless the increment wrapped */
+	ret
+END (__finitel)
+weak_alias (__finitel, finitel)
+hidden_def (__finitel)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floor.S b/REORG.TODO/sysdeps/i386/fpu/s_floor.S
new file mode 100644
index 0000000000..ed837dae40
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floor.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_floor.S,v 1.4 1995/05/09 00:01:59 jtc Exp $")
+
+ENTRY(__floor)			/* Rounds the double argument toward -oo to an integral value, left in st(0).  */
+	fldl	4(%esp)			/* st(0) = x */
+	subl	$32,%esp		/* scratch: new control word at (%esp), saved env at 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x400,%edx		/* round towards -oo */
+	orl	4(%esp),%edx		/* merge with the saved control word */
+	andl	$0xf7ff,%edx		/* clear bit 11 so the rounding field reads 01 */
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__floor)
+weak_alias (__floor, floor)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floorf.S b/REORG.TODO/sysdeps/i386/fpu/s_floorf.S
new file mode 100644
index 0000000000..84b6f7ed99
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floorf.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_floorf.S,v 1.3 1995/05/09 00:04:32 jtc Exp $")
+
+ENTRY(__floorf)			/* Rounds the float argument toward -oo to an integral value, left in st(0).  */
+	flds	4(%esp)			/* st(0) = x */
+	subl	$32,%esp		/* scratch: new control word at (%esp), saved env at 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x400,%edx		/* round towards -oo */
+	orl	4(%esp),%edx		/* merge with the saved control word */
+	andl	$0xf7ff,%edx		/* clear bit 11 so the rounding field reads 01 */
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__floorf)
+weak_alias (__floorf, floorf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floorl.S b/REORG.TODO/sysdeps/i386/fpu/s_floorl.S
new file mode 100644
index 0000000000..dc74a0c446
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floorl.S
@@ -0,0 +1,40 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__floorl)			/* Rounds the long double argument toward -oo to an integral value, left in st(0).  */
+	fldt	4(%esp)			/* st(0) = x */
+	subl	$32,%esp		/* scratch: new control word at (%esp), saved env at 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+
+	fnstenv	4(%esp)			/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x400,%edx		/* round towards -oo */
+	orl	4(%esp),%edx		/* merge with the saved control word */
+	andl	$0xf7ff,%edx		/* clear bit 11 so the rounding field reads 01 */
+	movl	%edx,(%esp)
+	fldcw	(%esp)			/* load modified control word */
+
+	frndint				/* round */
+
+	/* Preserve "invalid" exceptions from sNaN input.  */
+	fnstsw				/* current status word -> %ax */
+	andl	$0x1, %eax		/* keep only bit 0, the invalid-operation flag */
+	orl	%eax, 8(%esp)		/* fold it into the saved status word so fldenv keeps it */
+
+	fldenv	4(%esp)			/* restore original environment */
+
+	addl	$32,%esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__floorl)
+weak_alias (__floorl, floorl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/fpu/s_fmax.S
new file mode 100644
index 0000000000..218dcef421
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmax.S
@@ -0,0 +1,43 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmax)			// Returns the larger of two doubles in st(0); a NaN argument is skipped.
+	fldl	12(%esp)	// y
+	fxam			// classify y into C3/C2/C0
+	fnstsw			// status word -> %ax
+	fldl	4(%esp)		// y : x
+
+	andb	$0x45, %ah	// keep C3, C2 and C0
+	cmpb	$0x01, %ah	// C0 alone is the NaN class
+	je	1f		// y == NaN
+
+	fucom	%st(1)		// compare x (st(0)) with y (st(1))
+	fnstsw
+	sahf			// C0 -> CF
+	jnc	1f		// x >= y: keep x
+
+	fxch	%st(1)		// x < y or x is a NaN: bring y to the top
+1:	fstp	%st(1)		// drop st(1); the result stays in st(0)
+
+	ret
+END(__fmax)
+weak_alias (__fmax, fmax)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S
new file mode 100644
index 0000000000..b7a00cefeb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S
@@ -0,0 +1,43 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmaxf)			// Returns the larger of two floats in st(0); a NaN argument is skipped.
+	flds	8(%esp)		// y
+	fxam			// classify y into C3/C2/C0
+	fnstsw			// status word -> %ax
+	flds	4(%esp)		// y : x
+
+	andb	$0x45, %ah	// keep C3, C2 and C0
+	cmpb	$0x01, %ah	// C0 alone is the NaN class
+	je	1f		// y == NaN
+
+	fucom	%st(1)		// compare x (st(0)) with y (st(1))
+	fnstsw
+	sahf			// C0 -> CF
+	jnc	1f		// x >= y: keep x
+
+	fxch	%st(1)		// x < y or x is a NaN: bring y to the top
+1:	fstp	%st(1)		// drop st(1); the result stays in st(0)
+
+	ret
+END(__fmaxf)
+weak_alias (__fmaxf, fmaxf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S
new file mode 100644
index 0000000000..68162921db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S
@@ -0,0 +1,71 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmaxl)			// Returns the larger of two long doubles in st(0), with explicit NaN handling.
+	fldt	16(%esp)	// y
+	fxam			// classify y into C3/C2/C0
+	fnstsw			// status word -> %ax
+	fldt	4(%esp)		// y : x
+
+	andb	$0x45, %ah	// keep C3, C2 and C0
+	cmpb	$0x01, %ah	// C0 alone is the NaN class
+	je	2f		// y == NaN
+
+	fxam			// classify x (st(0))
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	3f		// x == NaN
+
+	fucom	%st(1)		// neither is a NaN: compare x with y
+	fnstsw
+	sahf			// C0 -> CF
+	jnc	1f		// x >= y: keep x
+
+	fxch	%st(1)		// x < y: bring y to the top
+1:	fstp	%st(1)		// drop st(1); the result stays in st(0)
+
+	ret
+
+2:	// st(1) is a NaN; st(0) may or may not be.
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	4f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 23(%esp)	// top mantissa byte of y; bit 0x40 is the quiet bit
+	jz	4f		// quiet bit clear: y is signaling
+	fstp	%st(1)		// y is a quiet NaN: return x
+	ret
+
+3:	// st(0) is a NaN; st(1) is not.  Test if st(0) is signaling.
+	testb	$0x40, 11(%esp)	// top mantissa byte of x; bit 0x40 is the quiet bit
+	jz	4f		// quiet bit clear: x is signaling
+	fstp	%st(0)		// x is a quiet NaN: pop it and return y
+	ret
+
+4:	// Both arguments are NaNs, or one is a signaling NaN.
+	faddp			// return x + y, which is a NaN here
+	ret
+END(__fmaxl)
+weak_alias (__fmaxl, fmaxl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/fpu/s_fmin.S
new file mode 100644
index 0000000000..a5bb0e06dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmin.S
@@ -0,0 +1,43 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmin)			// Returns the smaller of two doubles in st(0); a NaN argument is skipped.
+	fldl	4(%esp)		// x
+	fldl	12(%esp)	// x : y
+
+	fxam			// classify y (st(0)) into C3/C2/C0
+	fnstsw			// status word -> %ax
+	andb	$0x45, %ah	// keep C3, C2 and C0
+	cmpb	$0x01, %ah	// C0 alone is the NaN class
+	je	1f		// y == NaN
+
+	fucom	%st(1)		// compare y (st(0)) with x (st(1))
+	fnstsw
+	sahf			// C0 -> CF
+	jc	2f		// y < x or x is a NaN: keep y
+
+1:	fxch	%st(1)		// y is a NaN or y >= x: bring x to the top
+2:	fstp	%st(1)		// drop st(1); the result stays in st(0)
+
+	ret
+END(__fmin)
+weak_alias (__fmin, fmin)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/fpu/s_fminf.S
new file mode 100644
index 0000000000..fba4a41120
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fminf.S
@@ -0,0 +1,43 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fminf)			// Returns the smaller of two floats in st(0); a NaN argument is skipped.
+	flds	4(%esp)		// x
+	flds	8(%esp)		// x : y
+
+	fxam			// classify y (st(0)) into C3/C2/C0
+	fnstsw			// status word -> %ax
+	andb	$0x45, %ah	// keep C3, C2 and C0
+	cmpb	$0x01, %ah	// C0 alone is the NaN class
+	je	1f		// y == NaN
+
+	fucom	%st(1)		// compare y (st(0)) with x (st(1))
+	fnstsw
+	sahf			// C0 -> CF
+	jc	2f		// y < x or x is a NaN: keep y
+
+1:	fxch	%st(1)		// y is a NaN or y >= x: bring x to the top
+2:	fstp	%st(1)		// drop st(1); the result stays in st(0)
+
+	ret
+END(__fminf)
+weak_alias (__fminf, fminf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/fpu/s_fminl.S
new file mode 100644
index 0000000000..12ef21fda9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fminl.S
@@ -0,0 +1,71 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fminl)			// Returns the smaller of two long doubles in st(0), with explicit NaN handling.
+	fldt	16(%esp)	// y
+	fxam			// classify y into C3/C2/C0
+	fnstsw			// status word -> %ax
+	fldt	4(%esp)		// y : x
+
+	andb	$0x45, %ah	// keep C3, C2 and C0
+	cmpb	$0x01, %ah	// C0 alone is the NaN class
+	je	2f		// y == NaN
+
+	fxam			// classify x (st(0))
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	3f		// x == NaN
+
+	fucom	%st(1)		// neither is a NaN: compare x with y
+	fnstsw
+	sahf			// C0 -> CF
+	jc	1f		// x < y: keep x
+
+	fxch	%st(1)		// x >= y: bring y to the top
+1:	fstp	%st(1)		// drop st(1); the result stays in st(0)
+
+	ret
+
+2:	// st(1) is a NaN; st(0) may or may not be.
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x01, %ah
+	je	4f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 23(%esp)	// top mantissa byte of y; bit 0x40 is the quiet bit
+	jz	4f		// quiet bit clear: y is signaling
+	fstp	%st(1)		// y is a quiet NaN: return x
+	ret
+
+3:	// st(0) is a NaN; st(1) is not.  Test if st(0) is signaling.
+	testb	$0x40, 11(%esp)	// top mantissa byte of x; bit 0x40 is the quiet bit
+	jz	4f		// quiet bit clear: x is signaling
+	fstp	%st(0)		// x is a quiet NaN: pop it and return y
+	ret
+
+4:	// Both arguments are NaNs, or one is a signaling NaN.
+	faddp			// return x + y, which is a NaN here
+	ret
+END(__fminl)
+weak_alias (__fminl, fminl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c b/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c
new file mode 100644
index 0000000000..ce19fd0035
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c
@@ -0,0 +1,42 @@
+/* Return classification value corresponding to argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+
+#include <math_private.h>
+
+
+int
+__fpclassifyl (long double x)	/* Maps x to FP_ZERO, FP_SUBNORMAL, FP_NAN, FP_INFINITE or FP_NORMAL.  */
+{
+  u_int32_t ex, hx, lx;
+  int retval = FP_NORMAL;	/* Anything not matched below is reported as normal.  */
+
+  GET_LDOUBLE_WORDS (ex, hx, lx, x);
+  ex &= 0x7fff;	/* Drop the sign bit, keep the 15-bit exponent.  */
+  if ((ex | lx | hx) == 0)	/* Exponent and all mantissa bits are zero.  */
+    retval = FP_ZERO;
+  else if (ex == 0 && (hx & 0x80000000) == 0)	/* Zero exponent with the explicit leading bit clear.  */
+    retval = FP_SUBNORMAL;
+  else if (ex == 0x7fff)	/* All-ones exponent: NaN if any bit below the leading one is set.  */
+    retval = ((hx & 0x7fffffff) | lx) != 0 ? FP_NAN : FP_INFINITE;
+
+  return retval;
+}
+libm_hidden_def (__fpclassifyl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexp.S b/REORG.TODO/sysdeps/i386/fpu/s_frexp.S
new file mode 100644
index 0000000000..104f733bf6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexp.S
@@ -0,0 +1,83 @@
+/* ix87 specific frexp implementation for double.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type two54,@object
+two54:	.byte 0, 0, 0, 0, 0, 0, 0x50, 0x43
+	ASM_SIZE_DIRECTIVE(two54)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS	4		/* no space for saved regs */
+#define VAL0	PARMS
+#define VAL1	VAL0+4
+#define EXPP	VAL1+4
+
+	.text
+ENTRY (__frexp)			/* Splits a double into a fraction (st(0)) and a power of two stored via EXPP.  */
+
+	movl	VAL0(%esp), %ecx	/* low word of x */
+	movl	VAL1(%esp), %eax	/* high word: sign, exponent, top of mantissa */
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax	/* drop the sign bit */
+	orl	%eax, %ecx
+	jz	1f			/* x is +-0: store exponent 0 and return x */
+	xorl	%ecx, %ecx
+	cmpl	$0x7ff00000, %eax
+	jae	1f			/* Inf or NaN: store exponent 0 and return x */
+
+	cmpl	$0x00100000, %eax
+	jae	2f			/* normal number: no scaling needed */
+
+	fldl	VAL0(%esp)		/* subnormal: scale by 2^54 first */
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fmull	MO(two54)
+	movl	$-54, %ecx		/* compensate for the scaling */
+	fstpl	VAL0(%esp)		/* overwrite the argument with the scaled value */
+	fwait
+	movl	VAL1(%esp), %eax	/* reload the high word */
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+
+2:	shrl	$20, %eax		/* biased exponent */
+	andl	$0x800fffff, %edx	/* keep sign and mantissa bits */
+	subl	$1022, %eax		/* exponent relative to a fraction in [0.5, 1) */
+	orl	$0x3fe00000, %edx	/* force the exponent field to 0x3fe */
+	addl	%eax, %ecx
+	movl	%edx, VAL1(%esp)	/* patch the high word in place */
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the factor from the stack and return.  */
+1:	movl	EXPP(%esp), %eax
+	fldl	VAL0(%esp)
+	movl	%ecx, (%eax)
+
+	ret
+END (__frexp)
+weak_alias (__frexp, frexp)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S b/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S
new file mode 100644
index 0000000000..f21c39ec4b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S
@@ -0,0 +1,80 @@
+/* ix87 specific frexp implementation for float.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type two25,@object
+two25:	.byte 0, 0, 0, 0x4c
+	ASM_SIZE_DIRECTIVE(two25)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS	4		/* no space for saved regs */
+#define VAL	PARMS
+#define EXPP	VAL+4
+
+	.text
+ENTRY (__frexpf)		/* Splits a float into a fraction (st(0)) and a power of two stored via EXPP.  */
+
+	movl	VAL(%esp), %eax		/* raw bits of x */
+	xorl	%ecx, %ecx		/* exponent result defaults to 0 */
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax	/* drop the sign bit */
+	jz	1f			/* x is +-0: return x with exponent 0 */
+	cmpl	$0x7f800000, %eax
+	jae	1f			/* Inf or NaN: return x with exponent 0 */
+
+	cmpl	$0x00800000, %eax
+	jae	2f			/* normal number: no scaling needed */
+
+	flds	VAL(%esp)		/* subnormal: scale by 2^25 first */
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fmuls	MO(two25)
+	movl	$-25, %ecx		/* compensate for the scaling */
+	fstps	VAL(%esp)		/* overwrite the argument with the scaled value */
+	fwait
+	movl	VAL(%esp), %eax		/* reload the bits */
+	movl	%eax, %edx
+	andl	$0x7fffffff, %eax
+
+2:	shrl	$23, %eax		/* biased exponent */
+	andl	$0x807fffff, %edx	/* keep sign and mantissa bits */
+	subl	$126, %eax		/* exponent relative to a fraction in [0.5, 1) */
+	orl	$0x3f000000, %edx	/* force the exponent field to 0x7e */
+	addl	%eax, %ecx
+	movl	%edx, VAL(%esp)		/* patch the argument in place */
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the factor from the stack and return.  */
+1:	movl	EXPP(%esp), %eax
+	flds	VAL(%esp)
+	movl	%ecx, (%eax)
+
+	ret
+END (__frexpf)
+weak_alias (__frexpf, frexpf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S b/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S
new file mode 100644
index 0000000000..04f28888d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S
@@ -0,0 +1,92 @@
+/* ix87 specific frexp implementation for long double.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	.type two64,@object
+two64:	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
+	ASM_SIZE_DIRECTIVE(two64)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS	4		/* no space for saved regs */
+#define VAL0	PARMS
+#define VAL1	VAL0+4
+#define VAL2	VAL1+4
+#define EXPP	VAL2+4
+
+	.text
+ENTRY (__frexpl)		/* Splits a long double into a fraction (st(0)) and a power of two stored via EXPP.  */
+
+	movl	VAL0(%esp), %ecx	/* low mantissa word */
+	movl	VAL2(%esp), %eax	/* word holding sign and exponent */
+	orl	VAL1(%esp), %ecx	/* combine with the high mantissa word */
+	movl	%eax, %edx
+	andl	$0x7fff, %eax		/* isolate the 15-bit exponent */
+	orl	%eax, %ecx
+	jz	1f			/* x is +-0: store exponent 0 and return x */
+	xorl	%ecx, %ecx
+	cmpl	$0x7fff, %eax
+	je	3f			/* Inf or NaN */
+
+	cmpl	$0, %eax
+	jne	2f			/* nonzero exponent: no scaling needed */
+
+	fldt	VAL0(%esp)		/* zero exponent: scale by 2^64 first */
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	fmull	MO(two64)	/* It's not necessary to use a 80bit factor */
+	movl	$-64, %ecx		/* compensate for the scaling */
+	fstpt	VAL0(%esp)		/* overwrite the argument with the scaled value */
+	fwait
+	movl	VAL2(%esp), %eax	/* reload sign and exponent */
+	movl	%eax, %edx
+	andl	$0x7fff, %eax
+
+2:	andl	$0x8000, %edx		/* keep only the sign bit */
+	subl	$16382, %eax		/* exponent relative to a fraction in [0.5, 1) */
+	orl	$0x3ffe, %edx		/* force the exponent field to 0x3ffe */
+	addl	%eax, %ecx
+	movl	%edx, VAL2(%esp)	/* patch sign/exponent in place */
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the factor from the stack and return.  */
+1:	movl	EXPP(%esp), %eax
+	fldt	VAL0(%esp)
+	movl	%ecx, (%eax)
+
+	ret
+
+	/* Infinity or NaN; ensure signaling NaNs are quieted.  */
+3:	movl	EXPP(%esp), %eax
+	fldt	VAL0(%esp)
+	fadd	%st			/* x + x */
+	movl	%ecx, (%eax)		/* %ecx is 0 here */
+	ret
+END (__frexpl)
+weak_alias (__frexpl, frexpl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c b/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c
new file mode 100644
index 0000000000..cdd77183fa
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c
@@ -0,0 +1,32 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Change for long double by Ulrich Drepper <drepper@cygnus.com>.
+ * Intel i387 specific version.
+ * Public domain.
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/*
+ * isinfl(x) returns 1 if x is inf, -1 if x is -inf, else 0;
+ * no branching!
+ */
+
+#include <math.h>
+#include <math_private.h>
+
+int __isinfl(long double x)	/* 1 for +Inf, -1 for -Inf, 0 otherwise; computed without branches.  */
+{
+	int32_t se,hx,lx;
+	GET_LDOUBLE_WORDS(se,hx,lx,x);
+	/* This additional ^ 0x80000000 is necessary because in Intel's
+	   extended format the normally implicit one is explicit.  */
+	lx |= (hx ^ 0x80000000) | ((se & 0x7fff) ^ 0x7fff);	/* zero only for the infinity bit pattern */
+	lx |= -lx;	/* sign bit becomes set iff lx was nonzero */
+	se &= 0x8000;	/* isolate the sign of x */
+	return ~(lx >> 31) & (1 - (se >> 14));	/* mask (all ones iff infinite) & (1 or -1 by sign) */
+}
+hidden_def (__isinfl)
+weak_alias (__isinfl, isinfl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c b/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c
new file mode 100644
index 0000000000..816396d8fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c
@@ -0,0 +1,43 @@
+/* s_isnanl.c -- long double version for i387 of s_isnan.c.
+ * Conversion to long double by Ulrich Drepper,
+ * Cygnus Support, drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/*
+ * isnanl(x) returns a nonzero value if x is nan, else 0;
+ * no branching!
+ */
+
+#include <math.h>
+#include <math_private.h>
+
+int __isnanl(long double x)	/* Nonzero if x is a NaN, 0 otherwise; computed without branches.  */
+{
+	int32_t se,hx,lx;
+	GET_LDOUBLE_WORDS(se,hx,lx,x);
+	se = (se & 0x7fff) << 1;	/* exponent field, leaving bit 0 free */
+	/* The additional & 0x7fffffff is required because Intel's
+	   extended format has the normally implicit 1 explicitly
+	   present.  Sigh!  */
+	lx |= hx & 0x7fffffff;	/* nonzero iff any bit below the leading one is set */
+	se |= (u_int32_t)(lx|(-lx))>>31;	/* bit 0 = 1 iff those bits are nonzero */
+	se = 0xfffe - se;	/* negative only for exponent 0x7fff with bit 0 set */
+	return (int)((u_int32_t)(se))>>16;	/* the cast binds before >>; result is 0 unless se is negative */
+}
+hidden_def (__isnanl)
+weak_alias (__isnanl, isnanl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrint.S b/REORG.TODO/sysdeps/i386/fpu/s_llrint.S
new file mode 100644
index 0000000000..a597183aab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrint.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__llrint)			/* Converts the double argument to a 64-bit integer returned in %edx:%eax.  */
+	fldl	4(%esp)			/* st(0) = x */
+	subl	$8, %esp		/* room for the 64-bit result */
+	cfi_adjust_cfa_offset (8)
+	fistpll	(%esp)			/* store as 64-bit integer and pop */
+	fwait
+	popl	%eax			/* low 32 bits */
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx			/* high 32 bits */
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__llrint)
+weak_alias (__llrint, llrint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S b/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S
new file mode 100644
index 0000000000..a4b574eccb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__llrintf)		/* Converts the float argument to a 64-bit integer returned in %edx:%eax.  */
+	flds	4(%esp)			/* st(0) = x */
+	subl	$8, %esp		/* room for the 64-bit result */
+	cfi_adjust_cfa_offset (8)
+	fistpll	(%esp)			/* store as 64-bit integer and pop */
+	fwait
+	popl	%eax			/* low 32 bits */
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx			/* high 32 bits */
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__llrintf)
+weak_alias (__llrintf, llrintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S b/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S
new file mode 100644
index 0000000000..7b48c02ef4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__llrintl)		/* Converts the long double argument to a 64-bit integer returned in %edx:%eax.  */
+	fldt	4(%esp)			/* st(0) = x */
+	subl	$8, %esp		/* room for the 64-bit result */
+	cfi_adjust_cfa_offset (8)
+	fistpll	(%esp)			/* store as 64-bit integer and pop */
+	fwait
+	popl	%eax			/* low 32 bits */
+	cfi_adjust_cfa_offset (-4)
+	popl	%edx			/* high 32 bits */
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__llrintl)
+weak_alias (__llrintl, llrintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1p.S b/REORG.TODO/sysdeps/i386/fpu/s_log1p.S
new file mode 100644
index 0000000000..7978e76095
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1p.S
@@ -0,0 +1,67 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $")
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	/* The fyl2xp1 can only be used for values in
+		-1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2
+	   0.29 is a safe value.
+	*/
+limit:	.double 0.29
+one:	.double 1.0
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 function when the argument is in the range -0.29 to 0.29,
+ * otherwise fyl2x with the needed extra computation.
+ */
+	.text
+ENTRY(__log1p)			// Computes ln(1 + x) for a double x; result in st(0).
+	fldln2			// ln2
+
+	fldl	4(%esp)		// ln2 : x
+
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	fxam			// classify x
+	fnstsw			// status word -> %ax
+	fld	%st		// duplicate x for the range test
+	sahf			// C0 -> CF, C2 -> PF
+	jc	3f		// in case x is NaN or ±Inf
+4:	fabs			// |x| on the copy
+	fcompl	MO(limit)	// compare |x| with limit and pop the copy
+	fnstsw
+	sahf
+	jc	2f		// |x| < limit: use fyl2xp1
+
+	faddl	MO(one)		// x + 1
+	fyl2x			// ln2 * log2(x + 1)
+	ret
+
+2:	fyl2xp1			// ln2 * log2(x + 1) without forming x + 1
+	DBL_CHECK_FORCE_UFLOW_NONNAN	// NOTE(review): defined in i386-math-asm.h; presumably raises underflow for tiny results -- confirm
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// x is NaN: discard the copy
+	fstp	%st(1)		// discard ln2, leaving x as the result
+	ret
+
+END (__log1p)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S b/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S
new file mode 100644
index 0000000000..acaa299d94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S
@@ -0,0 +1,67 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_log1pf.S,v 1.4 1995/05/09 00:13:05 jtc Exp $")
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	/* The fyl2xp1 can only be used for values in
+		-1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2
+	   0.29 is a safe value.
+	*/
+limit:	.float 0.29
+one:	.float 1.0
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 function when the argument is in the range -0.29 to 0.29,
+ * otherwise fyl2x with the needed extra computation.
+ */
+	.text
+ENTRY(__log1pf)			// Computes ln(1 + x) for a float x; result in st(0).
+	fldln2			// ln2
+
+	flds	4(%esp)		// ln2 : x
+
+#ifdef	PIC
+	LOAD_PIC_REG (dx)
+#endif
+
+	fxam			// classify x
+	fnstsw			// status word -> %ax
+	fld	%st		// duplicate x for the range test
+	sahf			// C0 -> CF, C2 -> PF
+	jc	3f		// in case x is NaN or ±Inf
+4:	fabs			// |x| on the copy
+	fcomps	MO(limit)	// compare |x| with limit and pop the copy
+	fnstsw
+	sahf
+	jc	2f		// |x| < limit: use fyl2xp1
+
+	fadds	MO(one)		// x + 1
+	fyl2x			// ln2 * log2(x + 1)
+	ret
+
+2:	fyl2xp1			// ln2 * log2(x + 1) without forming x + 1
+	FLT_CHECK_FORCE_UFLOW_NONNAN	// NOTE(review): defined in i386-math-asm.h; presumably raises underflow for tiny results -- confirm
+	ret
+
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// x is NaN: discard the copy
+	fstp	%st(1)		// discard ln2, leaving x as the result
+	ret
+
+END (__log1pf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S b/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S
new file mode 100644
index 0000000000..0fd05cbdb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S
@@ -0,0 +1,76 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $")
+
+	.section .rodata
+
+	.align ALIGNARG(4)
+	/* The fyl2xp1 can only be used for values in
+		-1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2
+	   0.29 is a safe value.
+	*/
+limit:	.tfloat 0.29	/* |x| bound up to which fyl2xp1 is used */
+	/* Please note: we use a double value here.  Since 1.0 has
+	   an exact representation this does not affect the accuracy
+	   but it helps to optimize the code.  */
+one:	.double 1.0
+	/* MO(sym): memory operand for sym; %edx-relative (@GOTOFF) when built PIC.  */
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 function when the argument is in the range -0.29 to 0.29,
+ * otherwise fyl2x with the needed extra computation.
+ */
+	.text
+ENTRY(__log1pl)
+	fldln2			// st(0) = ln(2), scale factor for fyl2x/fyl2xp1
+	// Load the long double argument x; the result is returned in st(0).
+	fldt	4(%esp)
+	// MO() operands are %edx-relative under PIC, so %edx is set up first.
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	// Classify x.  After sahf: CF = C0 (NaN or Inf), PF = C2 (Inf, not NaN).
+	fxam
+	fnstsw			// FPU status word -> %ax
+	fld	%st		// duplicate x for the range test below
+	sahf
+	jc	3f		// in case x is NaN or ±Inf
+4:
+	fabs
+	fldt	MO(limit)
+	fcompp			// compare limit with |x|, pop both
+	fnstsw
+	sahf
+	jnc	2f		// CF clear: limit >= |x|, fyl2xp1 can be used
+	// Inspect the sign/exponent word of x (bytes 8-9 of the argument).
+	movzwl	4+8(%esp), %eax
+	xorb	$0x80, %ah	// flip the sign bit so positive x compares high
+	cmpl	$0xc040, %eax
+	jae	5f		// x > 0 with biased exponent >= 0x4040: skip the +1
+	// NOTE(review): presumably x + 1 would just round back to x there.
+	faddl	MO(one)
+5:	fyl2x			// ln(2) * log2(st(0))
+	ret
+	// |x| <= limit: fyl2xp1 computes ln(2) * log2(x + 1) directly.
+2:	fyl2xp1
+	ret
+	// Reached with CF set: Inf rejoins the main path, NaN falls through.
+3:	jp	4b		// in case x is ±Inf
+	fstp	%st(1)		// pop the duplicate of x
+	fstp	%st(1)		// overwrite ln(2); st(0) = x
+	fadd	%st(0)		// return x + x (presumably quiets a signaling NaN)
+	ret
+
+END (__log1pl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logb.S b/REORG.TODO/sysdeps/i386/fpu/s_logb.S
new file mode 100644
index 0000000000..f78c091c8a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logb.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_logb.S,v 1.4 1995/05/09 00:14:30 jtc Exp $")
+
+ENTRY(__logb)
+	fldl	4(%esp)		/* st(0) = x (double argument) */
+	fxtract			/* st(1) = exponent of x, st(0) = significand */
+	fstp	%st		/* pop the significand; the exponent is returned in st(0) */
+	ret
+END (__logb)
+weak_alias (__logb, logb)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logbf.S b/REORG.TODO/sysdeps/i386/fpu/s_logbf.S
new file mode 100644
index 0000000000..91eb3d2925
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logbf.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_logbf.S,v 1.3 1995/05/09 00:15:12 jtc Exp $")
+
+ENTRY(__logbf)
+	flds	4(%esp)		/* st(0) = x (float argument) */
+	fxtract			/* st(1) = exponent of x, st(0) = significand */
+	fstp	%st		/* pop the significand; the exponent is returned in st(0) */
+	ret
+END (__logbf)
+weak_alias (__logbf, logbf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logbl.c b/REORG.TODO/sysdeps/i386/fpu/s_logbl.c
new file mode 100644
index 0000000000..391e2db489
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logbl.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__logbl (long double x)
+{
+  long double res;
+  /* x enters in st(0); fxtract leaves exponent in st(1), significand in st(0); fstp pops the latter.  */
+  asm ("fxtract\n"
+       "fstp	%%st" : "=t" (res) : "0" (x));
+  return res;
+}
+
+weak_alias (__logbl, logbl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrint.S b/REORG.TODO/sysdeps/i386/fpu/s_lrint.S
new file mode 100644
index 0000000000..79a374b399
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrint.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__lrint)
+	fldl	4(%esp)		/* st(0) = x (double argument) */
+	subl	$4, %esp	/* 4-byte scratch slot for the integer result */
+	cfi_adjust_cfa_offset (4)
+	fistpl	(%esp)		/* store x as a 32-bit integer (current rounding mode) and pop */
+	fwait			/* synchronize with the FPU before reading the slot */
+	popl	%eax		/* return value; also releases the scratch slot */
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__lrint)
+weak_alias (__lrint, lrint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S b/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S
new file mode 100644
index 0000000000..fc6e68e073
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__lrintf)
+	flds	4(%esp)		/* st(0) = x (float argument) */
+	subl	$4, %esp	/* 4-byte scratch slot for the integer result */
+	cfi_adjust_cfa_offset (4)
+	fistpl	(%esp)		/* store x as a 32-bit integer (current rounding mode) and pop */
+	fwait			/* synchronize with the FPU before reading the slot */
+	popl	%eax		/* return value; also releases the scratch slot */
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__lrintf)
+weak_alias (__lrintf, lrintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S b/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S
new file mode 100644
index 0000000000..ba6dbdf44c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__lrintl)
+	fldt	4(%esp)		/* st(0) = x (long double argument) */
+	subl	$4, %esp	/* 4-byte scratch slot for the integer result */
+	cfi_adjust_cfa_offset (4)
+	fistpl	(%esp)		/* store x as a 32-bit integer (current rounding mode) and pop */
+	fwait			/* synchronize with the FPU before reading the slot */
+	popl	%eax		/* return value; also releases the scratch slot */
+	cfi_adjust_cfa_offset (-4)
+	ret
+END(__lrintl)
+weak_alias (__lrintl, lrintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S
new file mode 100644
index 0000000000..f7b79b6ff2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyint)
+	fldl	4(%esp)		/* st(0) = x (double argument) */
+	subl	$32, %esp	/* scratch area for the FPU environment */
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)		/* save the environment; fnstenv also masks all FP exceptions */
+	frndint			/* round to an integer in the current rounding mode */
+	fldenv	4(%esp)		/* restore the saved environment, discarding flags set by frndint */
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__nearbyint)
+weak_alias (__nearbyint, nearbyint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S
new file mode 100644
index 0000000000..92df2f87b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyintf)
+	flds	4(%esp)		/* st(0) = x (float argument) */
+	subl	$32, %esp	/* scratch area for the FPU environment */
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)		/* save the environment; fnstenv also masks all FP exceptions */
+	frndint			/* round to an integer in the current rounding mode */
+	fldenv	4(%esp)		/* restore the saved environment, discarding flags set by frndint */
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__nearbyintf)
+weak_alias (__nearbyintf, nearbyintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S
new file mode 100644
index 0000000000..3b7d1e2436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyintl)
+	fldt	4(%esp)		/* st(0) = x (long double argument) */
+	subl	$32, %esp	/* scratch area for the FPU environment */
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)		/* save the environment; fnstenv also masks all FP exceptions */
+	frndint			/* round to an integer in the current rounding mode */
+	fnstsw			/* status word after rounding -> %ax */
+	andl	$0x1, %eax	/* keep only bit 0, the invalid-operation flag */
+	orl	%eax, 8(%esp)	/* merge it into the saved status word (environment offset 4) */
+	fldenv	4(%esp)		/* restore the environment: only "invalid" survives from frndint */
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END (__nearbyintl)
+weak_alias (__nearbyintl, nearbyintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c b/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c
new file mode 100644
index 0000000000..600ad7a8d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c
@@ -0,0 +1,125 @@
+/* s_nextafterl.c -- long double version of s_nextafter.c.
+ * Special version for i387.
+ * Conversion to long double by Ulrich Drepper,
+ * Cygnus Support, drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/* IEEE functions
+ *	nextafterl(x,y)
+ *	return the next machine floating-point number of x in the
+ *	direction toward y.
+ *   Special cases:
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+
+long double __nextafterl(long double x, long double y)
+{
+	u_int32_t hx,hy,ix,iy;
+	u_int32_t lx,ly;
+	int32_t esx,esy;
+	/* esx/esy: sign+exponent word; hx/hy, lx/ly: high and low mantissa words (presumed from use).  */
+	GET_LDOUBLE_WORDS(esx,hx,lx,x);
+	GET_LDOUBLE_WORDS(esy,hy,ly,y);
+	ix = esx&0x7fff;		/* exponent field of x, sign bit cleared */
+	iy = esy&0x7fff;		/* exponent field of y, sign bit cleared */
+
+	/* Intel's extended format has the normally implicit 1 explicit
+	   present.  Sigh!  */
+	if(((ix==0x7fff)&&(((hx&0x7fffffff)|lx)!=0)) ||   /* x is nan */
+	   ((iy==0x7fff)&&(((hy&0x7fffffff)|ly)!=0)))     /* y is nan */
+	   return x+y;			/* propagate the NaN through an addition */
+	if(x==y) return y;		/* x=y, return y */
+	if((ix|hx|lx)==0) {			/* x == 0 */
+	    long double u;
+	    SET_LDOUBLE_WORDS(x,esy&0x8000,0,1);/* return +-minsubnormal */
+	    u = math_opt_barrier (x);
+	    u = u * u;
+	    math_force_eval (u);		/* raise underflow flag */
+	    return x;
+	}
+	if(esx>=0) {			/* x > 0 */
+	    if(esx>esy||((esx==esy) && (hx>hy||((hx==hy)&&(lx>ly))))) {
+	      /* x > y, x -= ulp */
+		if(lx==0) {			/* borrow from the high word */
+		    if (hx <= 0x80000000) {
+		      if (esx == 0) {
+			--hx;
+		      } else {
+			esx -= 1;		/* step down into the next lower exponent */
+			hx = hx - 1;
+			if (esx > 0)
+			  hx |= 0x80000000;	/* keep the explicit leading 1 */
+		      }
+		    } else
+		      hx -= 1;
+		}
+		lx -= 1;
+	    } else {				/* x < y, x += ulp */
+		lx += 1;
+		if(lx==0) {			/* carry into the high word */
+		    hx += 1;
+		    if (hx==0 || (esx == 0 && hx == 0x80000000)) {
+			esx += 1;
+			hx |= 0x80000000;
+		    }
+		}
+	    }
+	} else {				/* x < 0 */
+	    if(esy>=0||(esx>esy||((esx==esy)&&(hx>hy||((hx==hy)&&(lx>ly)))))){
+	      /* x < y, x -= ulp */
+		if(lx==0) {			/* borrow from the high word */
+		    if (hx <= 0x80000000 && esx != 0xffff8000) {
+			esx -= 1;
+			hx = hx - 1;
+			if ((esx&0x7fff) > 0)
+			  hx |= 0x80000000;	/* keep the explicit leading 1 */
+		    } else
+		      hx -= 1;
+		}
+		lx -= 1;
+	    } else {				/* x > y, x += ulp */
+		lx += 1;
+		if(lx==0) {			/* carry into the high word */
+		    hx += 1;
+		    if (hx==0 || (esx == 0xffff8000 && hx == 0x80000000)) {
+			esx += 1;
+			hx |= 0x80000000;
+		    }
+		}
+	    }
+	}
+	esy = esx&0x7fff;		/* exponent field of the stepped value */
+	if(esy==0x7fff) {		/* exponent is all ones */
+	    long double u = x + x;	/* overflow  */
+	    math_force_eval (u);
+	    __set_errno (ERANGE);
+	}
+	if(esy==0) {			/* exponent is zero */
+	    long double u = x*x;		/* underflow */
+	    math_force_eval (u);		/* raise underflow flag */
+	    __set_errno (ERANGE);
+	}
+	SET_LDOUBLE_WORDS(x,esx,hx,lx);	/* assemble the stepped value */
+	return x;
+}
+weak_alias (__nextafterl, nextafterl)
+strong_alias (__nextafterl, __nexttowardl)
+weak_alias (__nextafterl, nexttowardl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c b/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c
new file mode 100644
index 0000000000..0b47044760
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c
@@ -0,0 +1,93 @@
+/* s_nexttoward.c
+ * Special i387 version
+ * Conversion from s_nextafter.c by Ulrich Drepper, Cygnus Support,
+ * drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/* IEEE functions
+ *	nexttoward(x,y)
+ *	return the next machine floating-point number of x in the
+ *	direction toward y.
+ *   Special cases:
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include <float.h>
+
+double __nexttoward(double x, long double y)
+{
+	int32_t hx,ix,iy;
+	u_int32_t lx,hy,ly,esy;
+	/* hx/lx: high and low words of x; esy/hy/ly: the three words of y (presumed from use).  */
+	EXTRACT_WORDS(hx,lx,x);
+	GET_LDOUBLE_WORDS(esy,hy,ly,y);
+	ix = hx&0x7fffffff;		/* high word of x, sign bit cleared */
+	iy = esy&0x7fff;		/* exponent field of y, sign bit cleared */
+
+	/* Intel's extended format has the normally implicit 1 explicit
+	   present.  Sigh!  */
+	if(((ix>=0x7ff00000)&&((ix-0x7ff00000)|lx)!=0) ||   /* x is nan */
+	   ((iy>=0x7fff)&&((hy&0x7fffffff)|ly)!=0))        /* y is nan */
+	   return x+y;			/* propagate the NaN through an addition */
+	if((long double) x==y) return y;	/* x=y, return y */
+	if((ix|lx)==0) {			/* x == 0 */
+	    double u;
+	    INSERT_WORDS(x,(esy&0x8000)<<16,1); /* return +-minsub */
+	    u = math_opt_barrier (x);
+	    u = u * u;
+	    math_force_eval (u);		/* raise underflow flag */
+	    return x;
+	}
+	if(hx>=0) {				/* x > 0 */
+	    if (x > y) {			/* x -= ulp */
+		if(lx==0) hx -= 1;		/* borrow from the high word */
+		lx -= 1;
+	    } else {				/* x < y, x += ulp */
+		lx += 1;
+		if(lx==0) hx += 1;		/* carry into the high word */
+	    }
+	} else {				/* x < 0 */
+	    if (x < y) {			/* x -= ulp */
+		if(lx==0) hx -= 1;		/* borrow from the high word */
+		lx -= 1;
+	    } else {				/* x > y, x += ulp */
+		lx += 1;
+		if(lx==0) hx += 1;		/* carry into the high word */
+	    }
+	}
+	hy = hx&0x7ff00000;		/* exponent field of the stepped value */
+	if(hy>=0x7ff00000) {		/* exponent is all ones */
+	  double u = x+x;			/* overflow  */
+	  math_force_eval (u);
+	  __set_errno (ERANGE);
+	}
+	if(hy<0x00100000) {		/* exponent is zero */
+	    double u = x*x;			/* underflow */
+	    math_force_eval (u);		/* raise underflow flag */
+	    __set_errno (ERANGE);
+	}
+	INSERT_WORDS(x,hx,lx);		/* assemble the stepped value */
+	return x;
+}
+weak_alias (__nexttoward, nexttoward)
+#ifdef NO_LONG_DOUBLE
+strong_alias (__nexttoward, __nexttowardl)
+weak_alias (__nexttoward, nexttowardl)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c b/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c
new file mode 100644
index 0000000000..e1156d1e4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c
@@ -0,0 +1,77 @@
+/* s_nexttowardf.c -- float version of s_nextafter.c.
+ * Special i387 version.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include <float.h>
+
+float __nexttowardf(float x, long double y)
+{
+	int32_t hx,ix,iy;
+	u_int32_t hy,ly,esy;
+	/* hx: the word of x; esy/hy/ly: the three words of y (presumed from use).  */
+	GET_FLOAT_WORD(hx,x);
+	GET_LDOUBLE_WORDS(esy,hy,ly,y);
+	ix = hx&0x7fffffff;		/* word of x, sign bit cleared */
+	iy = esy&0x7fff;		/* exponent field of y, sign bit cleared */
+
+	/* Intel's extended format has the normally implicit 1 explicit
+	   present.  Sigh!  */
+	if((ix>0x7f800000) ||			/* x is nan */
+	   (iy>=0x7fff&&(((hy&0x7fffffff)|ly)!=0))) /* y is nan */
+	   return x+y;			/* propagate the NaN through an addition */
+	if((long double) x==y) return y;	/* x=y, return y */
+	if(ix==0) {				/* x == 0 */
+	    float u;
+	    SET_FLOAT_WORD(x,((esy&0x8000)<<16)|1);/* return +-minsub*/
+	    u = math_opt_barrier (x);
+	    u = u * u;
+	    math_force_eval (u);		/* raise underflow flag */
+	    return x;
+	}
+	if(hx>=0) {				/* x > 0 */
+	    if(x > y) {				/* x -= ulp */
+		hx -= 1;
+	    } else {				/* x < y, x += ulp */
+		hx += 1;
+	    }
+	} else {				/* x < 0 */
+	    if(x < y) {				/* x -= ulp */
+		hx -= 1;
+	    } else {				/* x > y, x += ulp */
+		hx += 1;
+	    }
+	}
+	hy = hx&0x7f800000;		/* exponent field of the stepped value */
+	if(hy>=0x7f800000) {		/* exponent is all ones */
+	  float u = x+x;			/* overflow  */
+	  math_force_eval (u);
+	  __set_errno (ERANGE);
+	}
+	if(hy<0x00800000) {		/* exponent is zero */
+	    float u = x*x;			/* underflow */
+	    math_force_eval (u);		/* raise underflow flag */
+	    __set_errno (ERANGE);
+	}
+	SET_FLOAT_WORD(x,hx);		/* assemble the stepped value */
+	return x;
+}
+weak_alias (__nexttowardf, nexttowardf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquo.S b/REORG.TODO/sysdeps/i386/fpu/s_remquo.S
new file mode 100644
index 0000000000..341285db30
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquo.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define DVDND	PARMS		/* stack offset of the dividend (double) */
+#define DVSOR	DVDND+8		/* stack offset of the divisor (double) */
+#define QUOP	DVSOR+8		/* stack offset of the pointer receiving the quotient */
+
+	.text
+ENTRY (__remquo)
+	/* Returns the fprem1 remainder in st(0) and stores a signed 3-bit quotient through QUOP.  */
+	fldl	DVSOR(%esp)
+	fldl	DVDND(%esp)
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b			/* PF = C2 set: reduction incomplete, iterate */
+	fstp	%st(1)			/* drop the divisor, keep the remainder */
+	/* Compute the quotient modulo 8 from condition bits C3, C1 and C0.  */
+	movl	%eax, %ecx
+	shrl	$8, %eax
+	shrl	$12, %ecx
+	andl	$4, %ecx		/* C3 (status bit 14) -> bit 2 */
+	andl	$3, %eax		/* C1:C0 (status bits 9:8) -> bits 1:0 */
+	orl	%eax, %ecx
+	leal	(%ecx,%ecx,2),%ecx	/* index * 3 = bit offset into the table */
+	movl	$0xef2a60, %eax		/* eight packed 3-bit entries: 0,4,1,5,2,6,3,7 */
+	shrl	%cl, %eax
+	andl	$7, %eax		/* selected entry = low three quotient bits */
+	movl	QUOP(%esp), %ecx
+	movl	DVDND+4(%esp), %edx	/* high word of the dividend (holds its sign) */
+	xorl	DVSOR+4(%esp), %edx
+	testl	$0x80000000, %edx	/* do the operand signs differ? */
+	jz	1f
+	negl	%eax			/* yes: the quotient is negative */
+1:	movl	%eax, (%ecx)
+	/* The remainder is already in st(0).  */
+	ret
+END (__remquo)
+weak_alias (__remquo, remquo)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquof.S b/REORG.TODO/sysdeps/i386/fpu/s_remquof.S
new file mode 100644
index 0000000000..62063f068f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquof.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define DVDND	PARMS		/* stack offset of the dividend (float) */
+#define DVSOR	DVDND+4		/* stack offset of the divisor (float) */
+#define QUOP	DVSOR+4		/* stack offset of the pointer receiving the quotient */
+
+	.text
+ENTRY (__remquof)
+	/* Returns the fprem1 remainder in st(0) and stores a signed 3-bit quotient through QUOP.  */
+	flds	DVSOR(%esp)
+	flds	DVDND(%esp)
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b			/* PF = C2 set: reduction incomplete, iterate */
+	fstp	%st(1)			/* drop the divisor, keep the remainder */
+	/* Compute the quotient modulo 8 from condition bits C3, C1 and C0.  */
+	movl	%eax, %ecx
+	shrl	$8, %eax
+	shrl	$12, %ecx
+	andl	$4, %ecx		/* C3 (status bit 14) -> bit 2 */
+	andl	$3, %eax		/* C1:C0 (status bits 9:8) -> bits 1:0 */
+	orl	%eax, %ecx
+	leal	(%ecx,%ecx,2),%ecx	/* index * 3 = bit offset into the table */
+	movl	$0xef2a60, %eax		/* eight packed 3-bit entries: 0,4,1,5,2,6,3,7 */
+	shrl	%cl, %eax
+	andl	$7, %eax		/* selected entry = low three quotient bits */
+	movl	QUOP(%esp), %ecx
+	movl	DVDND(%esp), %edx	/* raw float word of the dividend (sign in bit 31) */
+	xorl	DVSOR(%esp), %edx
+	testl	$0x80000000, %edx	/* do the operand signs differ? */
+	jz	1f
+	negl	%eax			/* yes: the quotient is negative */
+1:	movl	%eax, (%ecx)
+	/* The remainder is already in st(0).  */
+	ret
+END (__remquof)
+weak_alias (__remquof, remquof)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquol.S b/REORG.TODO/sysdeps/i386/fpu/s_remquol.S
new file mode 100644
index 0000000000..f3d84fc7c2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquol.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define DVDND	PARMS		/* stack offset of the dividend (long double, 12-byte slot) */
+#define DVSOR	DVDND+12	/* stack offset of the divisor (long double, 12-byte slot) */
+#define QUOP	DVSOR+12	/* stack offset of the pointer receiving the quotient */
+
+	.text
+ENTRY (__remquol)
+	/* Returns the fprem1 remainder in st(0) and stores a signed 3-bit quotient through QUOP.  */
+	fldt	DVSOR(%esp)
+	fldt	DVDND(%esp)
+1:	fprem1
+	fstsw	%ax
+	sahf
+	jp	1b			/* PF = C2 set: reduction incomplete, iterate */
+	fstp	%st(1)			/* drop the divisor, keep the remainder */
+	/* Compute the quotient modulo 8 from condition bits C3, C1 and C0.  */
+	movl	%eax, %ecx
+	shrl	$8, %eax
+	shrl	$12, %ecx
+	andl	$4, %ecx		/* C3 (status bit 14) -> bit 2 */
+	andl	$3, %eax		/* C1:C0 (status bits 9:8) -> bits 1:0 */
+	orl	%eax, %ecx
+	leal	(%ecx,%ecx,2),%ecx	/* index * 3 = bit offset into the table */
+	movl	$0xef2a60, %eax		/* eight packed 3-bit entries: 0,4,1,5,2,6,3,7 */
+	shrl	%cl, %eax
+	andl	$7, %eax		/* selected entry = low three quotient bits */
+	movl	QUOP(%esp), %ecx
+	movl	DVDND+8(%esp), %edx	/* sign/exponent word of the dividend */
+	xorl	DVSOR+8(%esp), %edx
+	testl	$0x8000, %edx		/* do the operand signs (bit 15) differ? */
+	jz	1f
+	negl	%eax			/* yes: the quotient is negative */
+1:	movl	%eax, (%ecx)
+	/* The remainder is already in st(0).  */
+	ret
+END (__remquol)
+weak_alias (__remquol, remquol)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rint.S b/REORG.TODO/sysdeps/i386/fpu/s_rint.S
new file mode 100644
index 0000000000..be36c5f0ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rint.S
@@ -0,0 +1,15 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_rint.S,v 1.4 1995/05/09 00:16:08 jtc Exp $")
+
+ENTRY(__rint)
+	fldl	4(%esp)		/* st(0) = x (double argument) */
+	frndint			/* round to an integer in the current rounding mode */
+	ret			/* result is returned in st(0) */
+END (__rint)
+weak_alias (__rint, rint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rintf.S b/REORG.TODO/sysdeps/i386/fpu/s_rintf.S
new file mode 100644
index 0000000000..2b358c1cf1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rintf.S
@@ -0,0 +1,15 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_rintf.S,v 1.3 1995/05/09 00:17:22 jtc Exp $")
+
+ENTRY(__rintf)
+	flds	4(%esp)		/* st(0) = x (float argument) */
+	frndint			/* round to an integer in the current rounding mode */
+	ret			/* result is returned in st(0) */
+END (__rintf)
+weak_alias (__rintf, rintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rintl.c b/REORG.TODO/sysdeps/i386/fpu/s_rintl.c
new file mode 100644
index 0000000000..66af9cb675
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rintl.c
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__rintl (long double x)
+{
+  long double res;
+  /* x enters in st(0); frndint rounds it in place using the current rounding mode.  */
+  asm ("frndint" : "=t" (res) : "0" (x));
+  return res;
+}
+
+weak_alias (__rintl, rintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c b/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c
new file mode 100644
index 0000000000..1009713fbc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbn.  So we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c b/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c
new file mode 100644
index 0000000000..5e558c3540
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbnf.  So we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c b/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c
new file mode 100644
index 0000000000..cda2ec11c8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbnl.  So we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S
new file mode 100644
index 0000000000..4e90903115
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_scalbn.S,v 1.4 1995/05/09 00:19:06 jtc Exp $")
+
+ENTRY(__scalbn)
+	fildl	12(%esp)	/* st(0) = n, the 32-bit integer after the 8-byte double */
+	fldl	4(%esp)		/* st(0) = x, st(1) = n */
+	fscale			/* st(0) = x * 2^n */
+	fstp	%st(1)		/* drop n, keep the scaled value in st(0) */
+	DBL_NARROW_EVAL
+	ret
+END (__scalbn)
+strong_alias (__scalbn, __scalbln)
+	/* NOTE(review): presumably keeps scalbln exported from libc for binaries linked before 2.20.  */
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbn, scalbln, GLIBC_2_1);
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S
new file mode 100644
index 0000000000..f8353c4c75
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_scalbnf.S,v 1.3 1995/05/09 00:19:59 jtc Exp $")
+
+ENTRY(__scalbnf)
+	fildl	8(%esp)		/* st(0) = n, the 32-bit integer after the 4-byte float */
+	flds	4(%esp)		/* st(0) = x, st(1) = n */
+	fscale			/* st(0) = x * 2^n */
+	fstp	%st(1)		/* drop n, keep the scaled value in st(0) */
+	FLT_NARROW_EVAL
+	ret
+END (__scalbnf)
+strong_alias (__scalbnf, __scalblnf)
+	/* NOTE(review): presumably keeps scalblnf exported from libc for binaries linked before 2.20.  */
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbnf, scalblnf, GLIBC_2_1);
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S
new file mode 100644
index 0000000000..839b5ff353
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__scalbnl)
+	fildl	16(%esp)	/* st(0) = n, the 32-bit integer after the 12-byte long double slot */
+	fldt	4(%esp)		/* st(0) = x, st(1) = n */
+	fscale			/* st(0) = x * 2^n */
+	fstp	%st(1)		/* drop n, keep the scaled value in st(0) */
+	ret
+END (__scalbnl)
+strong_alias (__scalbnl, __scalblnl)
+	/* NOTE(review): presumably keeps scalblnl exported from libc for binaries linked before 2.20.  */
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbnl, scalblnl, GLIBC_2_1);
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significand.S b/REORG.TODO/sysdeps/i386/fpu/s_significand.S
new file mode 100644
index 0000000000..4859b7ed71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significand.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_significand.S,v 1.4 1995/05/09 00:21:47 jtc Exp $")
+
+ENTRY(__significand)
+	fldl	4(%esp)		/* st(0) = x (double argument) */
+	fxtract			/* st(1) = exponent of x, st(0) = significand */
+	fstp	%st(1)		/* overwrite the exponent with the significand and pop */
+	ret
+END (__significand)
+weak_alias (__significand, significand)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significandf.S b/REORG.TODO/sysdeps/i386/fpu/s_significandf.S
new file mode 100644
index 0000000000..3a2de97759
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significandf.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_significandf.S,v 1.3 1995/05/09 00:24:07 jtc Exp $")
+
+ENTRY(__significandf)
+	flds	4(%esp)		/* st(0) = x (float argument) */
+	fxtract			/* st(1) = exponent of x, st(0) = significand */
+	fstp	%st(1)		/* overwrite the exponent with the significand and pop */
+	ret
+END (__significandf)
+weak_alias (__significandf, significandf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significandl.c b/REORG.TODO/sysdeps/i386/fpu/s_significandl.c
new file mode 100644
index 0000000000..b8cb093502
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significandl.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__significandl (long double x)
+{
+  long double res;
+  /* x enters in st(0); fxtract leaves exponent in st(1), significand in st(0); fstp keeps the latter.  */
+  asm ("fxtract\n"
+       "fstp	%%st(1)" : "=t" (res) : "0" (x));
+  return res;
+}
+
+weak_alias (__significandl, significandl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_trunc.S b/REORG.TODO/sysdeps/i386/fpu/s_trunc.S
new file mode 100644
index 0000000000..e9a850b877
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_trunc.S
@@ -0,0 +1,37 @@
+/* Truncate double value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__trunc)
+	fldl	4(%esp)		/* st(0) = x (double argument) */
+	subl	$32, %esp	/* scratch: control word at (%esp), environment at 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)		/* save the environment; fnstenv also masks all FP exceptions */
+	movl	$0xc00, %edx	/* rounding-control bits 10-11 set = round toward zero */
+	orl	4(%esp), %edx	/* combine with the saved control word */
+	movl	%edx, (%esp)
+	fldcw	(%esp)		/* switch to round-toward-zero */
+	frndint			/* st(0) = x truncated to an integer */
+	fldenv	4(%esp)		/* restore the caller's control word and status flags */
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END(__trunc)
+weak_alias (__trunc, trunc)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_truncf.S b/REORG.TODO/sysdeps/i386/fpu/s_truncf.S
new file mode 100644
index 0000000000..a93f5b9a2e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_truncf.S
@@ -0,0 +1,37 @@
+/* Truncate float value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__truncf)
+	flds	4(%esp)		/* st(0) = x (float argument) */
+	subl	$32, %esp	/* scratch: control word at (%esp), environment at 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)		/* save the environment; fnstenv also masks all FP exceptions */
+	movl	$0xc00, %edx	/* rounding-control bits 10-11 set = round toward zero */
+	orl	4(%esp), %edx	/* combine with the saved control word */
+	movl	%edx, (%esp)
+	fldcw	(%esp)		/* switch to round-toward-zero */
+	frndint			/* st(0) = x truncated to an integer */
+	fldenv	4(%esp)		/* restore the caller's control word and status flags */
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END(__truncf)
+weak_alias (__truncf, truncf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_truncl.S b/REORG.TODO/sysdeps/i386/fpu/s_truncl.S
new file mode 100644
index 0000000000..a884123612
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_truncl.S
@@ -0,0 +1,40 @@
+/* Truncate long double value.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+ENTRY(__truncl)
+	fldt	4(%esp)		/* st(0) = x (long double argument) */
+	subl	$32, %esp	/* scratch: control word at (%esp), environment at 4(%esp) */
+	cfi_adjust_cfa_offset (32)
+	fnstenv	4(%esp)		/* save the environment; fnstenv also masks all FP exceptions */
+	movl	$0xc00, %edx	/* rounding-control bits 10-11 set = round toward zero */
+	orl	4(%esp), %edx	/* combine with the saved control word */
+	movl	%edx, (%esp)
+	fldcw	(%esp)		/* switch to round-toward-zero */
+	frndint			/* st(0) = x truncated to an integer */
+	fnstsw			/* status word after rounding -> %ax */
+	andl	$0x1, %eax	/* keep only bit 0, the invalid-operation flag */
+	orl	%eax, 8(%esp)	/* merge it into the saved status word (environment offset 4) */
+	fldenv	4(%esp)		/* restore the environment: only "invalid" survives from frndint */
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+END(__truncl)
+weak_alias (__truncl, truncl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/slowexp.c b/REORG.TODO/sysdeps/i386/fpu/slowexp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/slowexp.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/slowpow.c b/REORG.TODO/sysdeps/i386/fpu/slowpow.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/slowpow.c
@@ -0,0 +1 @@
+/* Not needed.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/t_exp.c b/REORG.TODO/sysdeps/i386/fpu/t_exp.c
new file mode 100644
index 0000000000..fd37963b05
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/t_exp.c
@@ -0,0 +1 @@
+/* Empty.  Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c b/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c
new file mode 100644
index 0000000000..ddd36d0964
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c
@@ -0,0 +1,8 @@
+/* The inline __ieee754_sqrt is not correctly rounding; it's OK for
+   most internal uses in glibc, but not for sqrt itself.  */
+#define __ieee754_sqrt __avoid_ieee754_sqrt	/* rename whatever the headers below define */
+#include <math.h>
+#include <math_private.h>
+#undef __ieee754_sqrt
+extern double __ieee754_sqrt (double);	/* from here on the name refers to an external function */
+#include <math/w_sqrt_compat.c>
diff --git a/REORG.TODO/sysdeps/i386/gccframe.h b/REORG.TODO/sysdeps/i386/gccframe.h
new file mode 100644
index 0000000000..579da40ae9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/gccframe.h
@@ -0,0 +1,27 @@
+/* Definition of object in frame unwind info.  i386 version.
+   Copyright (C) 2001-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define DWARF_FRAME_REGISTERS 17	/* NOTE(review): presumably consumed by the generic gccframe.h included below -- confirm */
+
+#define CRT_GET_RFIB_DATA(BASE)		\
+  {	/* Set BASE to the current value of the %ebx register.  */	\
+    register void *__ebx __asm__("ebx");\
+    BASE = __ebx;			\
+  }
+
+#include <sysdeps/generic/gccframe.h>
diff --git a/REORG.TODO/sysdeps/i386/gmp-mparam.h b/REORG.TODO/sysdeps/i386/gmp-mparam.h
new file mode 100644
index 0000000000..7ea503a403
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/gmp-mparam.h
@@ -0,0 +1,28 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#define BITS_PER_MP_LIMB 32	/* a limb is one 32-bit word ... */
+#define BYTES_PER_MP_LIMB 4	/* ... i.e. 4 bytes of BITS_PER_CHAR bits */
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#define IEEE_DOUBLE_BIG_ENDIAN 0	/* doubles are not stored big-endian here */
diff --git a/REORG.TODO/sysdeps/i386/htonl.S b/REORG.TODO/sysdeps/i386/htonl.S
new file mode 100644
index 0000000000..63279bb6e1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/htonl.S
@@ -0,0 +1,34 @@
+/* Change byte order in word.  For Intel 80x86, x >= 4.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   word		(sp + 4)
+*/
+
+	.text
+ENTRY (htonl)
+	movl	4(%esp), %eax		/* load the 32-bit argument */
+	bswap	%eax			/* reverse the order of its four bytes */
+	ret				/* result is in %eax */
+END (htonl)
+
+weak_alias (htonl, ntohl)		/* byte reversal is its own inverse */
diff --git a/REORG.TODO/sysdeps/i386/htons.S b/REORG.TODO/sysdeps/i386/htons.S
new file mode 100644
index 0000000000..a3c53a9944
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/htons.S
@@ -0,0 +1,35 @@
+/* Change byte order in word.  For Intel 80x86, x >= 3.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   word		(sp + 4)
+*/
+
+	.text
+ENTRY (htons)
+	movl	4(%esp), %eax		/* load the argument word */
+	andl	$0xffff, %eax		/* keep only the low 16 bits */
+	rorw	$8, %ax			/* rotate by 8: swaps the two bytes of %ax */
+	ret				/* result is in %eax, upper half zero */
+END (htons)
+
+weak_alias (htons, ntohs)		/* the byte swap is its own inverse */
diff --git a/REORG.TODO/sysdeps/i386/i386-mcount.S b/REORG.TODO/sysdeps/i386/i386-mcount.S
new file mode 100644
index 0000000000..733b8c78e7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i386-mcount.S
@@ -0,0 +1,79 @@
+/* i386-specific implementation of profiling support.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* We need a special version of the `mcount' function since for ix86 it
+   must not clobber any register.  This has several reasons:
+     - there is a bug in gcc as of version 2.7.2.2 which prohibits the
+       use of profiling together with nested functions
+     - the ELF `fixup' function uses GCC's regparm feature
+     - some (future) systems might want to pass parameters in registers.  */
+
+	.globl C_SYMBOL_NAME(_mcount)
+	.type C_SYMBOL_NAME(_mcount), @function
+	.align ALIGNARG(4)
+C_LABEL(_mcount)
+	/* Save the caller-clobbered registers.  */
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+
+	movl 12(%esp), %edx	/* our own return address (above the 3 saved regs) */
+	movl 4(%ebp), %eax	/* return address in the frame %ebp points to */
+
+	/* No need to access the PLT or GOT, __mcount_internal is an
+	   internal function and we can make a relative call.  */
+	call C_SYMBOL_NAME(__mcount_internal)	/* NOTE(review): presumably takes its arguments in %eax/%edx -- confirm */
+
+	/* Pop the saved registers.  Please note that `mcount' has no
+	   return value.  */
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+	ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(_mcount))
+
+#undef mcount
+weak_alias (_mcount, mcount)
+
+	/* Same as _mcount above, but doesn't require a frame pointer.  */
+	.globl C_SYMBOL_NAME(__fentry__)
+	.type C_SYMBOL_NAME(__fentry__), @function
+	.align ALIGNARG(4)
+C_LABEL(__fentry__)
+	/* Save the caller-clobbered registers.  */
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+
+	movl 12(%esp), %edx	/* our own return address (above the 3 saved regs) */
+	movl 16(%esp), %eax	/* next stack word up, read instead of 4(%ebp) */
+
+	/* No need to access the PLT or GOT, __mcount_internal is an
+	   internal function and we can make a relative call.  */
+	call C_SYMBOL_NAME(__mcount_internal)	/* NOTE(review): presumably takes its arguments in %eax/%edx -- confirm */
+
+	/* Pop the saved registers.  Please note that `__fentry__' has no
+	   return value.  */
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+	ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(__fentry__))
diff --git a/REORG.TODO/sysdeps/i386/i586/add_n.S b/REORG.TODO/sysdeps/i386/i586/add_n.S
new file mode 100644
index 0000000000..f73df092f0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/add_n.S
@@ -0,0 +1,143 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16		/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+ENTRY (__mpn_add_n)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi		/* %edi = result pointer */
+	cfi_rel_offset (edi, 12)
+	movl	S1(%esp),%esi		/* %esi = first source pointer */
+	cfi_rel_offset (esi, 8)
+	movl	S2(%esp),%ebx		/* %ebx = second source pointer */
+	cfi_rel_offset (ebx, 0)
+	movl	SIZE(%esp),%ecx		/* %ecx = limb count */
+	movl	(%ebx),%ebp		/* preload the first S2 limb */
+	cfi_rel_offset (ebp, 4)
+
+	decl	%ecx			/* size - 1: the last limb is added at L(end2) */
+	movl	%ecx,%edx
+	shrl	$3,%ecx			/* %ecx = number of 8-limb blocks */
+	andl	$7,%edx			/* %edx = limbs left over after those blocks */
+	testl	%ecx,%ecx		/* zero carry flag */
+	jz	L(end)
+	pushl	%edx			/* the unrolled loop uses %edx as scratch */
+	cfi_adjust_cfa_offset (4)
+
+	ALIGN (3)
+L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
+	leal	32(%edi),%edi		/* leal: advance without touching the carry flag */
+
+L(1):	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%ebp,%eax
+	movl	4(%ebx),%ebp
+	adcl	%ebp,%edx
+	movl	8(%ebx),%ebp
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L(2):	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	adcl	%ebp,%eax
+	movl	12(%ebx),%ebp
+	adcl	%ebp,%edx
+	movl	16(%ebx),%ebp
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L(3):	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	adcl	%ebp,%eax
+	movl	20(%ebx),%ebp
+	adcl	%ebp,%edx
+	movl	24(%ebx),%ebp
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L(4):	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%ebp,%eax
+	movl	28(%ebx),%ebp
+	adcl	%ebp,%edx
+	movl	32(%ebx),%ebp		/* preload the S2 limb for the next step */
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi
+	leal	32(%ebx),%ebx
+	decl	%ecx			/* decl leaves the carry flag intact */
+	jnz	L(oop)
+
+	popl	%edx			/* recover the leftover limb count */
+	cfi_adjust_cfa_offset (-4)
+L(end):
+	decl	%edx			/* test %edx w/o clobbering carry */
+	js	L(end2)
+	incl	%edx
+L(oop2):
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	adcl	%ebp,%eax
+	movl	4(%ebx),%ebp		/* preload the next S2 limb */
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebx),%ebx
+	decl	%edx
+	jnz	L(oop2)
+L(end2):
+	movl	(%esi),%eax		/* add the final limb pair */
+	adcl	%ebp,%eax
+	movl	%eax,(%edi)
+
+	sbbl	%eax,%eax		/* %eax = 0 or -1 according to the carry flag */
+	negl	%eax			/* return the carry-out as 0 or 1 */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_add_n)
diff --git a/REORG.TODO/sysdeps/i386/i586/addmul_1.S b/REORG.TODO/sysdeps/i386/i586/addmul_1.S
new file mode 100644
index 0000000000..a713192982
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/addmul_1.S
@@ -0,0 +1,94 @@
+/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+   the result to a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_addmul_1)
+
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %size
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%size,4), %res_ptr	/* point just past the last limb */
+	leal	(%s1_ptr,%size,4), %s1_ptr
+	negl	%size			/* index runs from -size up to 0 */
+	xorl	%ebp, %ebp		/* carry limb = 0; also clears the carry flag */
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+
+L(oop):	adcl	$0, %ebp		/* fold the pending carry flag into the carry limb */
+	movl	(%s1_ptr,%size,4), %eax
+
+	mull	%s2_limb		/* %edx:%eax = s1 limb * s2_limb */
+
+	addl	%ebp, %eax		/* add the carry limb to the low half */
+	movl	(%res_ptr,%size,4), %ebp
+
+	adcl	$0, %edx
+	addl	%eax, %ebp		/* add to the result limb; carry stays in the flag */
+
+	movl	%ebp, (%res_ptr,%size,4)
+	incl	%size			/* incl sets ZF but leaves the carry flag alone */
+
+	movl	%edx, %ebp		/* high half becomes the next carry limb */
+	jnz	L(oop)
+
+	adcl	$0, %ebp		/* fold in the carry from the last addl */
+	movl	%ebp, %eax		/* return the final carry limb */
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+#undef size
+END (__mpn_addmul_1)
diff --git a/REORG.TODO/sysdeps/i386/i586/bzero.S b/REORG.TODO/sysdeps/i386/i586/bzero.S
new file mode 100644
index 0000000000..2a106719a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/bzero.S
@@ -0,0 +1,4 @@
+#define USE_AS_BZERO		/* memset.S then takes (dest, len) and fills with zero */
+#define memset __bzero		/* the ENTRY (memset) in memset.S defines __bzero */
+#include <sysdeps/i386/i586/memset.S>
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/i586/init-arch.h b/REORG.TODO/sysdeps/i386/i586/init-arch.h
new file mode 100644
index 0000000000..4711212e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MINIMUM_ISA 586	/* NOTE(review): presumably read by the x86 init-arch.h included below -- confirm */
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i586/lshift.S b/REORG.TODO/sysdeps/i386/i586/lshift.S
new file mode 100644
index 0000000000..7941c28d9d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/lshift.S
@@ -0,0 +1,255 @@
+/* Pentium optimized __mpn_lshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16		/* space for 4 saved regs */
+#define RES	PARMS
+#define S	RES+4
+#define SIZE	S+4
+#define CNT	SIZE+4
+
+	.text
+ENTRY (__mpn_lshift)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebp, 0)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi		/* %edi = result pointer */
+	cfi_rel_offset (edi, 12)
+	movl	S(%esp),%esi		/* %esi = source pointer */
+	cfi_rel_offset (esi, 8)
+	movl	SIZE(%esp),%ebx		/* %ebx = limb count */
+	cfi_rel_offset (ebx, 0)
+	movl	CNT(%esp),%ecx		/* %ecx = shift count (used via %cl) */
+
+/* We can use faster code for shift-by-1 under certain conditions.  */
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%esi),%eax
+	cmpl	%edi,%eax
+	jnc	L(special)		/* jump if s_ptr + 1 >= res_ptr */
+	leal	(%esi,%ebx,4),%eax
+	cmpl	%eax,%edi
+	jnc	L(special)		/* jump if res_ptr >= s_ptr + size */
+
+L(normal):
+	leal	-4(%edi,%ebx,4),%edi	/* point at the most significant limb */
+	leal	-4(%esi,%ebx,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+	xorl	%eax,%eax
+	shldl	%cl,%edx,%eax		/* compute carry limb */
+	pushl	%eax			/* push carry limb onto stack */
+	cfi_adjust_cfa_offset (4)
+
+	decl	%ebx			/* size - 1: the last limb is stored at L(end2) */
+	pushl	%ebx			/* keep it for the remainder loop */
+	cfi_adjust_cfa_offset (4)
+	shrl	$3,%ebx			/* %ebx = number of 8-limb blocks */
+	jz	L(end)
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+L(oop):	movl	-28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebp
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	shldl	%cl,%eax,%ebp
+	shldl	%cl,%edx,%eax
+	movl	%ebp,(%edi)
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebp
+	movl	-12(%esi),%eax
+	shldl	%cl,%ebp,%edx
+	shldl	%cl,%eax,%ebp
+	movl	%edx,-8(%edi)
+	movl	%ebp,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebp
+	shldl	%cl,%edx,%eax
+	shldl	%cl,%ebp,%edx
+	movl	%eax,-16(%edi)
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	shldl	%cl,%eax,%ebp
+	shldl	%cl,%edx,%eax
+	movl	%ebp,-24(%edi)
+	movl	%eax,-28(%edi)
+
+	subl	$32,%esi
+	subl	$32,%edi
+	decl	%ebx
+	jnz	L(oop)
+
+L(end):	popl	%ebx			/* recover size - 1 */
+	cfi_adjust_cfa_offset (-4)
+	andl	$7,%ebx			/* limbs left over after the 8-limb blocks */
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shldl	%cl,%eax,%edx
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	subl	$4,%esi
+	subl	$4,%edi
+	decl	%ebx
+	jnz	L(oop2)
+
+L(end2):
+	shll	%cl,%edx		/* compute least significant limb */
+	movl	%edx,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+	cfi_adjust_cfa_offset (-4)
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+/* We loop from the least significant end of the arrays.  As the function
+   is documented to work for overlapping source and destination, this is
+   only permissible when the two checks at the top found no harmful overlap.
+*/
+
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (edi, 12)
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebp, 4)
+	cfi_rel_offset (ebx, 0)
+L(special):
+	movl	(%esi),%edx
+	addl	$4,%esi
+
+	decl	%ebx			/* size - 1: the last limb is stored at L(L1) */
+	pushl	%ebx			/* keep it for the remainder loop */
+	cfi_adjust_cfa_offset (4)
+	shrl	$3,%ebx			/* %ebx = number of 8-limb blocks */
+
+	addl	%edx,%edx		/* shift first limb left by 1; carry = its top bit */
+	incl	%ebx			/* incl/decl pair: test %ebx for zero ... */
+	decl	%ebx			/* ... without changing the carry flag */
+	jz	L(Lend)
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+L(Loop):
+	movl	28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebp
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%eax,%eax		/* x + x + carry: shift left by 1 through carry */
+	movl	%ebp,(%edi)
+	adcl	%edx,%edx
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebp
+	movl	12(%esi),%eax
+	adcl	%ebp,%ebp
+	movl	%edx,8(%edi)
+	adcl	%eax,%eax
+	movl	%ebp,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebp
+	adcl	%edx,%edx
+	movl	%eax,16(%edi)
+	adcl	%ebp,%ebp
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebp,24(%edi)
+	adcl	%edx,%edx
+	movl	%eax,28(%edi)
+
+	leal	32(%esi),%esi		/* use leal not to clobber carry */
+	leal	32(%edi),%edi
+	decl	%ebx
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebx			/* recover size - 1 */
+	cfi_adjust_cfa_offset (-4)
+	sbbl	%eax,%eax		/* save carry in %eax */
+	andl	$7,%ebx			/* limbs left over after the 8-limb blocks */
+	jz	L(Lend2)
+	addl	%eax,%eax		/* restore carry from eax */
+L(Loop2):
+	movl	%edx,%ebp
+	movl	(%esi),%edx
+	adcl	%edx,%edx
+	movl	%ebp,(%edi)
+
+	leal	4(%esi),%esi		/* use leal not to clobber carry */
+	leal	4(%edi),%edi
+	decl	%ebx
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		/* restore carry from eax */
+L(L1):	movl	%edx,(%edi)		/* store last limb */
+
+	sbbl	%eax,%eax		/* %eax = 0 or -1 according to the carry flag */
+	negl	%eax			/* return the bit shifted out as 0 or 1 */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_lshift)
diff --git a/REORG.TODO/sysdeps/i386/i586/memcopy.h b/REORG.TODO/sysdeps/i386/i586/memcopy.h
new file mode 100644
index 0000000000..39f020a746
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memcopy.h
@@ -0,0 +1,95 @@
+/* memcopy.h -- definitions for memory copy functions.  Pentium version.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Get the i386 definitions.  We will override some of them below.  */
+#include <sysdeps/i386/memcopy.h>
+
+/* Written like this, the Pentium pipeline can execute the loop at a
+   sustained rate of 2 instructions/clock, or asymptotically 480
+   Mbytes/second at 60Mhz.  */
+
+#undef	WORD_COPY_FWD
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)		\
+  do									\
+    {									\
+      asm volatile ("subl	$32,%2\n"	/* bias count by -32 */	\
+		    "js		2f\n"		/* under 32 bytes: no loop */	\
+		    "movl	0(%0),%%edx\n"	/* alloc dest line */	\
+		    "1:\n"						\
+		    "movl	28(%0),%%eax\n"	/* alloc dest line */	\
+		    "subl	$32,%2\n"	/* decr loop count */	\
+		    "movl	0(%1),%%eax\n"	/* U pipe */		\
+		    "movl	4(%1),%%edx\n"	/* V pipe */		\
+		    "movl	%%eax,0(%0)\n"	/* U pipe */		\
+		    "movl	%%edx,4(%0)\n"	/* V pipe */		\
+		    "movl	8(%1),%%eax\n"				\
+		    "movl	12(%1),%%edx\n"				\
+		    "movl	%%eax,8(%0)\n"				\
+		    "movl	%%edx,12(%0)\n"				\
+		    "movl	16(%1),%%eax\n"				\
+		    "movl	20(%1),%%edx\n"				\
+		    "movl	%%eax,16(%0)\n"				\
+		    "movl	%%edx,20(%0)\n"				\
+		    "movl	24(%1),%%eax\n"				\
+		    "movl	28(%1),%%edx\n"				\
+		    "movl	%%eax,24(%0)\n"				\
+		    "movl	%%edx,28(%0)\n"				\
+		    "leal	32(%1),%1\n"	/* update src ptr */	\
+		    "leal	32(%0),%0\n"	/* update dst ptr */	\
+		    "jns	1b\n"		/* flags are from the subl */	\
+		    "2: addl	$32,%2" :	/* remove the bias */	\
+		    "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) :	\
+		    "0" (dst_bp), "1" (src_bp), "2" (nbytes) :		\
+		    "ax", "dx");					\
+    } while (0)
+
+#undef	WORD_COPY_BWD
+#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes)		\
+  do									\
+    {									\
+      asm volatile ("subl	$32,%2\n"	/* bias count by -32 */	\
+		    "js		2f\n"		/* under 32 bytes: no loop */	\
+		    "movl	-4(%0),%%edx\n"	/* touch dest; value unused */	\
+		    "1:\n"						\
+		    "movl	-32(%0),%%eax\n" /* touch dest; value unused */	\
+		    "subl	$32,%2\n"	/* decr loop count */	\
+		    "movl	-4(%1),%%eax\n"				\
+		    "movl	-8(%1),%%edx\n"				\
+		    "movl	%%eax,-4(%0)\n"				\
+		    "movl	%%edx,-8(%0)\n"				\
+		    "movl	-12(%1),%%eax\n"			\
+		    "movl	-16(%1),%%edx\n"			\
+		    "movl	%%eax,-12(%0)\n"			\
+		    "movl	%%edx,-16(%0)\n"			\
+		    "movl	-20(%1),%%eax\n"			\
+		    "movl	-24(%1),%%edx\n"			\
+		    "movl	%%eax,-20(%0)\n"			\
+		    "movl	%%edx,-24(%0)\n"			\
+		    "movl	-28(%1),%%eax\n"			\
+		    "movl	-32(%1),%%edx\n"			\
+		    "movl	%%eax,-28(%0)\n"			\
+		    "movl	%%edx,-32(%0)\n"			\
+		    "leal	-32(%1),%1\n"	/* step src end ptr down */	\
+		    "leal	-32(%0),%0\n"	/* step dst end ptr down */	\
+		    "jns	1b\n"		/* flags are from the subl */	\
+		    "2: addl	$32,%2" :	/* remove the bias */	\
+		    "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) :	\
+		    "0" (dst_ep), "1" (src_ep), "2" (nbytes) :		\
+		    "ax", "dx");					\
+    } while (0)
diff --git a/REORG.TODO/sysdeps/i386/i586/memcpy.S b/REORG.TODO/sysdeps/i386/i586/memcpy.S
new file mode 100644
index 0000000000..6474a3f653
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memcpy.S
@@ -0,0 +1,124 @@
+/* Highly optimized version for i586.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+8	/* space for 2 saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+        .text
+#if defined PIC && IS_IN (libc)
+ENTRY (__memcpy_chk)
+	movl	12(%esp), %eax		/* third argument: the length */
+	cmpl	%eax, 16(%esp)		/* compare the fourth argument against it */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* fourth argument < length (unsigned) */
+END (__memcpy_chk)			/* no ret: control continues into the code below */
+#endif
+ENTRY (memcpy)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+
+	movl	DEST(%esp), %edi
+	cfi_rel_offset (edi, 4)
+	movl	SRC(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl	LEN(%esp), %ecx
+	movl	%edi, %eax		/* dest: alignment input, and result on the short path */
+
+	/* We need this in any case.  */
+	cld
+
+	/* Cutoff for the big loop is a size of 32 bytes since otherwise
+	   the loop will never be entered.  */
+	cmpl	$32, %ecx
+	jbe	L(1)
+
+	negl	%eax
+	andl	$3, %eax		/* bytes needed to 4-byte align the destination */
+	subl	%eax, %ecx		/* length left after the alignment bytes */
+	xchgl	%eax, %ecx		/* %ecx = alignment bytes, %eax = remaining length */
+
+	rep; movsb
+
+	movl	%eax, %ecx
+	subl	$32, %ecx		/* bias the count by -32 for the loop test */
+	js	L(2)
+
+	/* Read ahead to make sure we write in the cache since the stupid
+	   i586 designers haven't implemented read-on-write-miss.  */
+	movl	(%edi), %eax
+L(3):	movl	28(%edi), %edx
+
+	/* Now correct the loop counter.  Please note that in the following
+	   code the flags are not changed anymore.  */
+	subl	$32, %ecx
+
+	movl	(%esi), %eax
+	movl	4(%esi), %edx
+	movl	%eax, (%edi)
+	movl	%edx, 4(%edi)
+	movl	8(%esi), %eax
+	movl	12(%esi), %edx
+	movl	%eax, 8(%edi)
+	movl	%edx, 12(%edi)
+	movl	16(%esi), %eax
+	movl	20(%esi), %edx
+	movl	%eax, 16(%edi)
+	movl	%edx, 20(%edi)
+	movl	24(%esi), %eax
+	movl	28(%esi), %edx
+	movl	%eax, 24(%edi)
+	movl	%edx, 28(%edi)
+
+	leal	32(%esi), %esi
+	leal	32(%edi), %edi
+
+	jns	L(3)			/* sign flag still comes from the subl above */
+
+	/* Correct extra loop counter modification.  */
+L(2):	addl	$32, %ecx
+#ifndef USE_AS_MEMPCPY
+	movl	DEST(%esp), %eax	/* %eax was used as scratch: reload the result */
+#endif
+
+L(1):	rep; movsb			/* copy the remaining %ecx bytes */
+
+#ifdef USE_AS_MEMPCPY
+	movl	%edi, %eax		/* result: pointer just past the last byte written */
+#endif
+
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (memcpy)
+#ifndef USE_AS_MEMPCPY
+libc_hidden_builtin_def (memcpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i586/mempcpy.S b/REORG.TODO/sysdeps/i386/i586/mempcpy.S
new file mode 100644
index 0000000000..720a4c0923
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/mempcpy.S
@@ -0,0 +1,8 @@
+#define USE_AS_MEMPCPY			/* memcpy.S then returns the end of the copied area */
+#define memcpy __mempcpy		/* the ENTRY (memcpy) in memcpy.S defines __mempcpy */
+#define __memcpy_chk __mempcpy_chk	/* likewise for the checking entry point */
+#include <sysdeps/i386/i586/memcpy.S>
+
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/i586/memset.S b/REORG.TODO/sysdeps/i386/i586/memset.S
new file mode 100644
index 0000000000..4f8f1bcf94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memset.S
@@ -0,0 +1,121 @@
+/* memset/bzero -- set memory area to CH/0
+   Highly optimized version for ix86, x>=5.
+   Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Torbjorn Granlund, <tege@matematik.su.se>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define DEST	RTN
+#ifdef USE_AS_BZERO
+# define LEN	DEST+4
+#else
+# define CHR	DEST+4
+# define LEN	CHR+4
+#endif
+
+        .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk)
+	movl	12(%esp), %eax		/* third argument: the length */
+	cmpl	%eax, 16(%esp)		/* compare the fourth argument against it */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* fourth argument < length (unsigned) */
+END (__memset_chk)			/* no ret: control continues into the code below */
+#endif
+ENTRY (memset)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+
+	movl	DEST(%esp), %edi
+	cfi_rel_offset (edi, 0)
+	movl	LEN(%esp), %edx
+#ifdef USE_AS_BZERO
+	xorl	%eax, %eax	/* we fill with 0 */
+#else
+	movb	CHR(%esp), %al	/* fill byte ... */
+	movb	%al, %ah	/* ... copied into both bytes of %ax */
+	movl	%eax, %ecx
+	shll	$16, %eax
+	movw	%cx, %ax	/* %eax now holds the fill byte four times */
+#endif
+	cld
+
+/* If less than 36 bytes to write, skip tricky code (it wouldn't work).  */
+	cmpl	$36, %edx
+	movl	%edx, %ecx	/* needed when branch is taken! */
+	jl	L(2)
+
+/* First write 0-3 bytes to make the pointer 32-bit aligned.  */
+	movl	%edi, %ecx	/* Copy ptr to ecx... */
+	negl	%ecx		/* ...and negate that and... */
+	andl	$3, %ecx	/* ...mask to get byte count.  */
+	subl	%ecx, %edx	/* adjust global byte count */
+	rep
+	stosb
+
+	subl	$32, %edx	/* offset count for unrolled loop */
+	movl	(%edi), %ecx	/* Fetch destination cache line */
+
+	.align	2, 0x90		/* supply 0x90 for broken assemblers */
+L(1):	movl	28(%edi), %ecx	/* allocate cache line for destination */
+	subl	$32, %edx	/* decr loop count */
+	movl	%eax, 0(%edi)	/* store words pairwise */
+	movl	%eax, 4(%edi)
+	movl	%eax, 8(%edi)
+	movl	%eax, 12(%edi)
+	movl	%eax, 16(%edi)
+	movl	%eax, 20(%edi)
+	movl	%eax, 24(%edi)
+	movl	%eax, 28(%edi)
+	leal	32(%edi), %edi	/* update destination pointer */
+	jge	L(1)		/* flags still come from the subl above */
+
+	leal	32(%edx), %ecx	/* reset offset count */
+
+/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped).  */
+L(2):	shrl	$2, %ecx	/* convert byte count to longword count */
+	rep
+	stosl
+
+/* Finally write the last 0-3 bytes.  */
+	movl	%edx, %ecx	/* the -32 offset does not change the low two bits */
+	andl	$3, %ecx
+	rep
+	stosb
+
+#ifndef USE_AS_BZERO
+	/* Load result (only if used as memset).  */
+	movl DEST(%esp), %eax	/* start address of destination is result */
+#endif
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (memset)
+libc_hidden_builtin_def (memset)
+
+#if defined SHARED && IS_IN (libc) && !defined __memset_chk \
+    && !defined USE_AS_BZERO
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i586/memusage.h b/REORG.TODO/sysdeps/i386/i586/memusage.h
new file mode 100644
index 0000000000..c8170874d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memusage.h
@@ -0,0 +1 @@
+#include "../i686/memusage.h"
diff --git a/REORG.TODO/sysdeps/i386/i586/mul_1.S b/REORG.TODO/sysdeps/i386/i586/mul_1.S
new file mode 100644
index 0000000000..bd3a07de90
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/mul_1.S
@@ -0,0 +1,90 @@
+/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
+   the result in a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS	/* stack offset of the result pointer argument */
+#define S1	RES+4	/* stack offset of the source pointer argument */
+#define SIZE	S1+4	/* stack offset of the limb count argument */
+#define S2LIMB	SIZE+4	/* stack offset of the multiplier limb argument */
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_mul_1)
+/* Stores the low word of S1[i] * S2LIMB plus the running carry limb at RES[i] for SIZE limbs; returns the final carry limb in %eax.  */
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %size
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%size,4), %res_ptr	/* point just past the end of both vectors... */
+	leal	(%s1_ptr,%size,4), %s1_ptr
+	negl	%size				/* ...and index them with -SIZE counting up to 0 */
+	xorl	%ebp, %ebp			/* carry limb = 0; also clears CF for the first adcl */
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+
+L(oop):	adcl	$0, %ebp			/* add CF left by the previous addl to the carry limb */
+	movl	(%s1_ptr,%size,4), %eax
+
+	mull	%s2_limb			/* %edx:%eax = source limb * s2_limb */
+
+	addl	%eax, %ebp			/* low product + carry limb; CF is consumed by the next adcl */
+
+	movl	%ebp, (%res_ptr,%size,4)
+	incl	%size				/* incl does not touch CF; ZF ends the loop */
+
+	movl	%edx, %ebp			/* high product becomes the next carry limb */
+	jnz	L(oop)
+
+	adcl	$0, %ebp			/* fold in CF from the last addl */
+	movl	%ebp, %eax			/* return value: final carry limb */
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+#undef size
+END (__mpn_mul_1)
diff --git a/REORG.TODO/sysdeps/i386/i586/rshift.S b/REORG.TODO/sysdeps/i386/i586/rshift.S
new file mode 100644
index 0000000000..24c76ee0bb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/rshift.S
@@ -0,0 +1,255 @@
+/* Pentium optimized __mpn_rshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16		/* space for 4 saved regs */
+#define RES	PARMS		/* stack offset of the result pointer argument */
+#define S	RES+4		/* stack offset of the source pointer argument */
+#define SIZE	S+4		/* stack offset of the limb count argument */
+#define CNT	SIZE+4		/* stack offset of the shift count argument */
+
+	.text
+ENTRY (__mpn_rshift)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebp, 0)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 12)
+	movl	S(%esp),%esi
+	cfi_rel_offset (esi, 8)
+	movl	SIZE(%esp),%ebx
+	cfi_rel_offset (ebx, 0)
+	movl	CNT(%esp),%ecx
+
+/* We can use faster code for shift-by-1 under certain conditions.  */
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%edi),%eax
+	cmpl	%esi,%eax
+	jnc	L(special)		/* jump if res_ptr + 1 >= s_ptr */
+	leal	(%edi,%ebx,4),%eax
+	cmpl	%eax,%esi
+	jnc	L(special)		/* jump if s_ptr >= res_ptr + size */
+
+L(normal):
+	movl	(%esi),%edx
+	addl	$4,%esi
+	xorl	%eax,%eax
+	shrdl	%cl,%edx,%eax		/* compute carry limb */
+	pushl	%eax			/* push carry limb onto stack */
+	cfi_adjust_cfa_offset (4)
+
+	decl	%ebx			/* size - 1 limbs are produced by the two loops below */
+	pushl	%ebx			/* keep size - 1 for the remainder loop */
+	cfi_adjust_cfa_offset (4)
+	shrl	$3,%ebx			/* number of 8-limb unrolled iterations */
+	jz	L(end)
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebp
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	shrdl	%cl,%eax,%ebp
+	shrdl	%cl,%edx,%eax
+	movl	%ebp,(%edi)
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebp
+	movl	12(%esi),%eax
+	shrdl	%cl,%ebp,%edx
+	shrdl	%cl,%eax,%ebp
+	movl	%edx,8(%edi)
+	movl	%ebp,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebp
+	shrdl	%cl,%edx,%eax
+	shrdl	%cl,%ebp,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	shrdl	%cl,%eax,%ebp
+	shrdl	%cl,%edx,%eax
+	movl	%ebp,24(%edi)
+	movl	%eax,28(%edi)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	decl	%ebx
+	jnz	L(oop)
+
+L(end):	popl	%ebx			/* reload size - 1 */
+	cfi_adjust_cfa_offset (-4)
+	andl	$7,%ebx			/* limbs left over after the unrolled loop */
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shrdl	%cl,%eax,%edx		/* compute result limb */
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	addl	$4,%esi
+	addl	$4,%edi
+	decl	%ebx
+	jnz	L(oop2)
+
+L(end2):
+	shrl	%cl,%edx		/* compute most significant limb */
+	movl	%edx,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb; it is the return value */
+	cfi_adjust_cfa_offset (-4)
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+/* We loop from most significant end of the arrays, which is only
+   permissible if the source and destination don't overlap, since the
+   function is documented to work for overlapping source and destination.
+*/
+
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (edi, 12)
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebp, 4)
+	cfi_rel_offset (ebx, 0)
+L(special):
+	leal	-4(%edi,%ebx,4),%edi	/* point at the last (highest-addressed) limb */
+	leal	-4(%esi,%ebx,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+
+	decl	%ebx
+	pushl	%ebx			/* keep size - 1 for the remainder loop */
+	cfi_adjust_cfa_offset (4)
+	shrl	$3,%ebx			/* number of 8-limb unrolled iterations */
+
+	shrl	$1,%edx			/* shift top limb; CF = bit shifted out */
+	incl	%ebx			/* incl/decl pair: test %ebx for zero... */
+	decl	%ebx			/* ...w/o clobbering carry */
+	jz	L(Lend)
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+L(Loop):
+	movl	-28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebp
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	rcrl	$1,%eax
+	movl	%ebp,(%edi)
+	rcrl	$1,%edx
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebp
+	movl	-12(%esi),%eax
+	rcrl	$1,%ebp
+	movl	%edx,-8(%edi)
+	rcrl	$1,%eax
+	movl	%ebp,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebp
+	rcrl	$1,%edx
+	movl	%eax,-16(%edi)
+	rcrl	$1,%ebp
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	rcrl	$1,%eax
+	movl	%ebp,-24(%edi)
+	rcrl	$1,%edx
+	movl	%eax,-28(%edi)
+
+	leal	-32(%esi),%esi		/* use leal not to clobber carry */
+	leal	-32(%edi),%edi
+	decl	%ebx
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebx			/* reload size - 1 */
+	cfi_adjust_cfa_offset (-4)
+	sbbl	%eax,%eax		/* save carry in %eax */
+	andl	$7,%ebx			/* limbs left over after the unrolled loop */
+	jz	L(Lend2)
+	addl	%eax,%eax		/* restore carry from eax */
+L(Loop2):
+	movl	%edx,%ebp
+	movl	(%esi),%edx
+	rcrl	$1,%edx
+	movl	%ebp,(%edi)
+
+	leal	-4(%esi),%esi		/* use leal not to clobber carry */
+	leal	-4(%edi),%edi
+	decl	%ebx
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		/* restore carry from eax */
+L(L1):	movl	%edx,(%edi)		/* store last limb */
+
+	movl	$0,%eax			/* movl leaves the carry flag alone */
+	rcrl	$1,%eax			/* return value: last shifted-out bit in bit 31 */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_rshift)
diff --git a/REORG.TODO/sysdeps/i386/i586/stpcpy.S b/REORG.TODO/sysdeps/i386/i586/stpcpy.S
new file mode 100644
index 0000000000..8691efd01c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/stpcpy.S
@@ -0,0 +1,8 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+/* Assemble the shared i586 strcpy source in its USE_AS_STPCPY variant under the entry name __stpcpy.  */
+#include <sysdeps/i386/i586/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i586/strchr.S b/REORG.TODO/sysdeps/i386/i586/strchr.S
new file mode 100644
index 0000000000..02f66b8f72
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strchr.S
@@ -0,0 +1,348 @@
+/* Find character CH in a NUL terminated string.
+   Highly optimized version for ix86, x>=5.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to execute some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RTN	PARMS
+#define STR	RTN	/* stack offset of the string pointer argument */
+#define CHR	STR+4	/* stack offset of the character argument */
+
+	.text
+ENTRY (strchr)
+	/* Returns in %eax a pointer to the first byte of STR equal to the low byte of CHR, or 0 if the terminating NUL is reached first.  */
+	pushl %edi		/* Save callee-saved registers.  */
+	cfi_adjust_cfa_offset (4)	/* each push moves %esp 4 bytes further from the CFA */
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	pushl %ebx
+	cfi_adjust_cfa_offset (4)
+	pushl %ebp
+	cfi_adjust_cfa_offset (4)
+
+	movl STR(%esp), %eax
+	movl CHR(%esp), %edx
+
+	movl %eax, %edi		/* duplicate string pointer for later */
+	cfi_rel_offset (edi, 12)
+	xorl %ecx, %ecx		/* clear %ecx */
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* now it is 0|0|c|c */
+	movb %dl, %cl		/* we construct the lower half in %ecx */
+
+	shll $16, %edx		/* now %edx is c|c|0|0 */
+	movb %cl, %ch		/* now %ecx is 0|0|c|c */
+
+	orl %ecx, %edx		/* and finally c|c|c|c */
+	andl $3, %edi		/* mask alignment bits */
+
+	jz L(11)		/* alignment is 0 => start loop */
+
+	movb %dl, %cl		/* 0 is needed below */
+	jp L(0)			/* exactly two bits set */
+
+	xorb (%eax), %cl	/* is byte the one we are looking for? */
+	jz L(out)		/* yes => return pointer */
+
+	xorb %dl, %cl		/* undo the xor with C: %cl = byte, ZF set if NUL */
+	je L(3)			/* yes => return NULL */
+
+	movb 1(%eax), %cl	/* load single byte */
+	incl %eax
+
+	cmpb %cl, %dl		/* is byte == C? */
+	je L(out)		/* yes => return pointer */
+
+	cmpb $0, %cl		/* is byte NUL? */
+	je L(3)			/* yes => return NULL */
+
+	incl %eax
+	decl %edi
+
+	jne L(11)		/* alignment was 2 => pointer is aligned now */
+
+L(0):	movb (%eax), %cl	/* load single byte */
+
+	cmpb %cl, %dl		/* is byte == C? */
+	je L(out)		/* yes => return pointer */
+
+	cmpb $0, %cl		/* is byte NUL? */
+	je L(3)			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebx, 4)
+	cfi_rel_offset (ebp, 0)
+
+	/* The following code is the preparation for the loop.  The
+	   four instructions up to `L(1)' will not be executed in the loop
+	   because the same code is found at the end of the loop, but
+	   there it is executed in parallel with other instructions.  */
+L(11):	movl (%eax), %ecx
+	movl $magic, %ebp
+
+	movl $magic, %edi
+	addl %ecx, %ebp
+
+	/* The main loop: it looks complex and indeed it is.  I would
+	   love to say `it was hard to write, so it should be hard to
+	   read' but I will give some more hints.  To fully understand
+	   this code you should first take a look at the i486 version.
+	   The basic algorithm is the same, but here the code is organized
+	   in a way which permits to use both pipelines all the time.
+
+	   I tried to make it a bit more understandable by indenting
+	   the code according to stage in the algorithm.  It goes as
+	   follows:
+		check for 0 in 1st word
+			check for C in 1st word
+					check for 0 in 2nd word
+						check for C in 2nd word
+		check for 0 in 3rd word
+			check for C in 3rd word
+					check for 0 in 4th word
+						check for C in 4th word
+
+	   Please note that doing the test for NUL before the test for
+	   C allows us to overlap the test for 0 in the next word with
+	   the test for C.  */
+
+L(1):	xorl %ecx, %ebp			/* (word+magic)^word */
+	addl %ecx, %edi			/* add magic word */
+
+	leal 4(%eax), %eax		/* increment pointer */
+	jnc L(4)			/* previous addl caused overflow? */
+
+		movl %ecx, %ebx		/* duplicate original word */
+	orl $magic, %ebp		/* ((word+magic)^word)|magic */
+
+	addl $1, %ebp			/* ((word+magic)^word)|magic == 0xffffffff? */
+	jne L(4)				/* no => we found word with NUL */
+
+		movl $magic, %esi	/* load magic value */
+		xorl %edx, %ebx		/* clear words which are C */
+
+					movl (%eax), %ecx
+		addl %ebx, %esi		/* (word+magic) */
+
+					movl $magic, %edi
+		jnc L(5)		/* previous addl caused overflow? */
+
+					movl %edi, %ebp
+		xorl %ebx, %esi		/* (word+magic)^word */
+
+					addl %ecx, %ebp
+		orl $magic, %esi	/* ((word+magic)^word)|magic */
+
+		addl $1, %esi		/* ((word+magic)^word)|magic==0xf..f?*/
+		jne L(5)		/* no => we found word with C */
+
+					xorl %ecx, %ebp
+					addl %ecx, %edi
+
+					leal 4(%eax), %eax
+					jnc L(4)
+
+						movl %ecx, %ebx
+					orl $magic, %ebp
+
+					addl $1, %ebp
+					jne L(4)
+
+						movl $magic, %esi
+						xorl %edx, %ebx
+
+	movl (%eax), %ecx
+						addl %ebx, %esi
+
+	movl $magic, %edi
+						jnc L(5)
+
+	movl %edi, %ebp
+						xorl %ebx, %esi
+
+	addl %ecx, %ebp
+						orl $magic, %esi
+
+						addl $1, %esi
+						jne L(5)
+
+	xorl %ecx, %ebp
+	addl %ecx, %edi
+
+	leal 4(%eax), %eax
+	jnc L(4)
+
+		movl %ecx, %ebx
+	orl $magic, %ebp
+
+	addl $1, %ebp
+	jne L(4)
+
+		movl $magic, %esi
+		xorl %edx, %ebx
+
+					movl (%eax), %ecx
+		addl %ebx, %esi
+
+					movl $magic, %edi
+		jnc L(5)
+
+					movl %edi, %ebp
+		xorl %ebx, %esi
+
+					addl %ecx, %ebp
+		orl $magic, %esi
+
+		addl $1, %esi
+		jne L(5)
+
+					xorl %ecx, %ebp
+					addl %ecx, %edi
+
+					leal 4(%eax), %eax
+					jnc L(4)
+
+						movl %ecx, %ebx
+					orl $magic, %ebp
+
+					addl $1, %ebp
+					jne L(4)
+
+						movl $magic, %esi
+						xorl %edx, %ebx
+
+	movl (%eax), %ecx
+						addl %ebx, %esi
+
+	movl $magic, %edi
+						jnc L(5)
+
+	movl %edi, %ebp
+						xorl %ebx, %esi
+
+	addl %ecx, %ebp
+						orl $magic, %esi
+
+						addl $1, %esi
+
+						je L(1)
+
+	/* We know there is no NUL byte but a C byte in the word.
+	   %ebx contains NUL in this particular byte.  */
+L(5):	subl $4, %eax		/* adjust pointer */
+	testb %bl, %bl		/* first byte == C? */
+
+	jz L(out)		/* yes => return pointer */
+
+	incl %eax		/* increment pointer */
+	testb %bh, %bh		/* second byte == C? */
+
+	jz L(out)		/* yes => return pointer */
+
+	shrl $16, %ebx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmp $0, %bl		/* third byte == C */
+	je L(out)		/* yes => return pointer */
+
+	incl %eax		/* increment pointer */
+
+L(out):	popl %ebp		/* restore saved registers */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl %ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (edi, 12)
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebx, 4)
+	cfi_rel_offset (ebp, 0)
+	/* We know there is a NUL byte in the word.  But we have to test
+	   whether there is a C byte before it in the word.  */
+L(4):	subl $4, %eax		/* adjust pointer */
+	cmpb %dl, %cl		/* first byte == C? */
+
+	je L(out)		/* yes => return pointer */
+
+	cmpb $0, %cl		/* first byte == NUL? */
+	je L(3)			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	cmpb %dl, %ch		/* second byte == C? */
+	je L(out)		/* yes => return pointer */
+
+	cmpb $0, %ch		/* second byte == NUL? */
+	je L(3)			/* yes => return NULL */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb %dl, %cl		/* third byte == C? */
+	je L(out)		/* yes => return pointer */
+
+	cmpb $0, %cl		/* third byte == NUL? */
+	je L(3)			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	/* The test for the fourth byte is necessary!  */
+	cmpb %dl, %ch		/* fourth byte == C? */
+	je L(out)		/* yes => return pointer */
+
+L(3):	xorl %eax, %eax
+	jmp L(out)
+END (strchr)
+
+#undef index
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
diff --git a/REORG.TODO/sysdeps/i386/i586/strcpy.S b/REORG.TODO/sysdeps/i386/i586/strcpy.S
new file mode 100644
index 0000000000..a444604f4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strcpy.S
@@ -0,0 +1,169 @@
+/* strcpy/stpcpy implementation for i586.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+12	/* space for 3 saved regs */
+#define RTN	PARMS
+#define DEST	RTN	/* stack offset of the destination pointer argument */
+#define SRC	DEST+4	/* stack offset of the source pointer argument */
+
+#ifndef USE_AS_STPCPY
+# define STRCPY strcpy
+#endif
+
+#define magic 0xfefefeff
+
+	.text
+ENTRY (STRCPY)
+	/* Copies the NUL-terminated string at SRC to DEST.  Returns DEST, or with USE_AS_STPCPY the address of the copied NUL.  */
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	DEST(%esp), %edi
+	cfi_rel_offset (edi, 8)
+	movl	SRC(%esp), %esi
+	cfi_rel_offset (esi, 4)
+
+	xorl	%eax, %eax
+	leal	-1(%esi), %ecx
+
+	movl	$magic, %ebx
+	cfi_rel_offset (ebx, 0)
+	andl	$3, %ecx	/* (src - 1) & 3 selects the entry point below */
+
+#ifdef PIC
+	call	2f
+	cfi_adjust_cfa_offset (4)
+2:	popl	%edx
+	cfi_adjust_cfa_offset (-4)
+	/* 0xb is the distance between 2: and 1: but we avoid writing
+	   1f-2b because the assembler generates worse code.  */
+	leal	0xb(%edx,%ecx,8), %ecx
+#else
+	leal	1f(,%ecx,8), %ecx	/* 1f + 8 * index; the scale of 8 presumes 8 code bytes per byte-copy stanza */
+#endif
+
+	jmp	*%ecx		/* index 0, 1, 2 => copy 3, 2, 1 single bytes first; index 3 => L(1) */
+
+	.align 8
+1:
+	orb	(%esi), %al	/* %eax is 0 here, so ZF tells whether the byte is NUL */
+	jz	L(end)
+	stosb
+	xorl	%eax, %eax
+	incl	%esi
+
+	orb	(%esi), %al
+	jz	L(end)
+	stosb
+	xorl	%eax, %eax
+	incl	%esi
+
+	orb	(%esi), %al
+	jz	L(end)
+	stosb
+	xorl	%eax, %eax
+	incl	%esi
+
+L(1):	movl	(%esi), %ecx	/* word loop; %eax is 0 on every entry */
+	leal	4(%esi),%esi
+
+	subl	%ecx, %eax	/* first step to negate word */
+	addl	%ebx, %ecx	/* add magic word; CF is tested by jnc below */
+
+	decl	%eax		/* complete negation of word; decl keeps CF */
+	jnc	L(3)		/* no carry out of word+magic => finish bytewise */
+
+	movl	%ecx, %edx
+	xorl	%ecx, %eax	/* (word+magic)^word */
+
+	subl	%ebx, %edx	/* recover the original word */
+	andl	$~magic, %eax	/* any of the carry flags set? */
+
+	jne	L(4)		/* yes => finish bytewise */
+
+	movl	%edx, (%edi)	/* store the whole word */
+	leal	4(%edi),%edi
+
+	jmp	L(1)
+
+L(3):	movl	%ecx, %edx
+
+	subl	%ebx, %edx	/* recover the original word */
+
+L(4):	movb	%dl, (%edi)	/* store the word byte by byte, stopping after a NUL */
+	testb	%dl, %dl
+
+	movl	%edx, %eax
+	jz	L(end2)
+
+	shrl	$16, %eax	/* make upper bytes accessible */
+	movb	%dh, 1(%edi)
+#ifdef USE_AS_STPCPY
+	addl	$1, %edi	/* stpcpy keeps %edi on the last byte stored */
+#endif
+
+	cmpb	$0, %dh
+	jz	L(end2)
+
+#ifdef USE_AS_STPCPY
+	movb	%al, 1(%edi)
+	addl	$1, %edi
+
+	cmpb	$0, %al
+	jz	L(end2)
+
+	addl	$1, %edi
+#else
+	movb	%al, 2(%edi)
+	testb	%al, %al
+
+	leal	3(%edi), %edi	/* leal does not disturb ZF from testb */
+	jz	L(end2)
+#endif
+
+L(end):	movb	%ah, (%edi)	/* %ah: 4th byte of the word, or 0 when coming from a byte stanza */
+
+L(end2):
+#ifdef USE_AS_STPCPY
+	movl	%edi, %eax	/* result: address of the last byte stored */
+#else
+	movl	DEST(%esp), %eax	/* result: the destination pointer argument */
+#endif
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (STRCPY)
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i586/strlen.S b/REORG.TODO/sysdeps/i386/i586/strlen.S
new file mode 100644
index 0000000000..cfea2e020f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strlen.S
@@ -0,0 +1,182 @@
+/* strlen -- Compute length of NUL terminated string.
+   Highly optimized version for ix86, x>=5.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to execute some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS		/* stack offset of the string pointer argument */
+
+	.text
+ENTRY (strlen)
+	/* Returns in %eax the number of bytes before the first NUL of STR.  Only %eax, %ecx and %edx are used.  */
+	movl STR(%esp), %eax
+	movl $3, %edx		/* load mask (= 3) */
+
+	andl %eax, %edx		/* separate last two bits of address */
+
+	jz L(1)			/* aligned => start loop */
+	jp L(0)			/* exactly two bits set */
+
+	cmpb %dh, (%eax)	/* is byte NUL?  (%dh is 0 as %edx <= 3) */
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	cmpb %dh, (%eax)	/* is byte NUL? */
+
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	xorl $2, %edx		/* %edx was 2 => now 0 and the pointer is aligned */
+
+	jz L(1)
+
+L(0):	cmpb %dh, (%eax)	/* is byte NUL? */
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	xorl %edx, %edx		/* We need %edx == 0 for later */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 Note: %edx == 0 in any case here.  */
+
+L(1):
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L(3)		/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L(3)		/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L(3)		/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L(3)		/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L(3)		/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	je L(1)			/* no => start loop again */
+
+
+L(3):	subl $4, %eax		/* correct too early pointer increment */
+	subl $magic, %ecx	/* undo the addl to get the word back */
+
+	cmpb $0, %cl		/* lowest byte NUL? */
+	jz L(2)			/* yes => return */
+
+	inc %eax		/* increment pointer */
+	testb %ch, %ch		/* second byte NUL? */
+
+	jz L(2)			/* yes => return */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb $0, %cl		/* is third byte NUL? */
+	jz L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+
+L(2):	subl STR(%esp), %eax	/* now compute the length as difference
+				   between start and terminating NUL
+				   character */
+	ret
+END (strlen)
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/i386/i586/sub_n.S b/REORG.TODO/sysdeps/i386/i586/sub_n.S
new file mode 100644
index 0000000000..21b5a2742c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/sub_n.S
@@ -0,0 +1,143 @@
+/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+   and store difference in a third limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16		/* space for 4 saved regs */
+#define RES	PARMS		/* stack offset of the result pointer argument */
+#define S1	RES+4		/* stack offset of the minuend pointer argument */
+#define S2	S1+4		/* stack offset of the subtrahend pointer argument */
+#define SIZE	S2+4		/* stack offset of the limb count argument */
+
+	.text
+ENTRY (__mpn_sub_n)
+	/* Stores S1[i] - S2[i] with borrow propagation at RES[i] for SIZE limbs; returns the final borrow (0 or 1) in %eax.  */
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 12)
+	movl	S1(%esp),%esi
+	cfi_rel_offset (esi, 8)
+	movl	S2(%esp),%ebx
+	cfi_rel_offset (ebx, 0)
+	movl	SIZE(%esp),%ecx
+	movl	(%ebx),%ebp		/* preload first subtrahend limb */
+	cfi_rel_offset (ebp, 4)
+
+	decl	%ecx			/* the last limb is handled at L(end2) */
+	movl	%ecx,%edx
+	shrl	$3,%ecx			/* number of 8-limb unrolled iterations */
+	andl	$7,%edx			/* limbs left for the single-limb loop */
+	testl	%ecx,%ecx		/* zero carry flag */
+	jz	L(end)
+	pushl	%edx
+	cfi_adjust_cfa_offset (4)
+
+	ALIGN (3)
+L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
+	leal	32(%edi),%edi
+
+L(1):	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	sbbl	%ebp,%eax
+	movl	4(%ebx),%ebp
+	sbbl	%ebp,%edx
+	movl	8(%ebx),%ebp
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L(2):	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	sbbl	%ebp,%eax
+	movl	12(%ebx),%ebp
+	sbbl	%ebp,%edx
+	movl	16(%ebx),%ebp
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L(3):	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	sbbl	%ebp,%eax
+	movl	20(%ebx),%ebp
+	sbbl	%ebp,%edx
+	movl	24(%ebx),%ebp
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L(4):	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	sbbl	%ebp,%eax
+	movl	28(%ebx),%ebp
+	sbbl	%ebp,%edx
+	movl	32(%ebx),%ebp		/* preload subtrahend limb for the next round */
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi		/* leal and decl leave the borrow in CF intact */
+	leal	32(%ebx),%ebx
+	decl	%ecx
+	jnz	L(oop)
+
+	popl	%edx			/* reload count for the single-limb loop */
+	cfi_adjust_cfa_offset (-4)
+L(end):
+	decl	%edx			/* test %edx w/o clobbering carry */
+	js	L(end2)
+	incl	%edx
+L(oop2):
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	sbbl	%ebp,%eax
+	movl	4(%ebx),%ebp		/* preload next subtrahend limb */
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebx),%ebx
+	decl	%edx
+	jnz	L(oop2)
+L(end2):
+	movl	(%esi),%eax		/* last limb */
+	sbbl	%ebp,%eax
+	movl	%eax,(%edi)
+
+	sbbl	%eax,%eax		/* %eax = -1 if the last sbbl borrowed, else 0 */
+	negl	%eax			/* return value: borrow as 0 or 1 */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_sub_n)
diff --git a/REORG.TODO/sysdeps/i386/i586/submul_1.S b/REORG.TODO/sysdeps/i386/i586/submul_1.S
new file mode 100644
index 0000000000..5e5e121ca2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/submul_1.S
@@ -0,0 +1,94 @@
+/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+   the result from a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_submul_1)
+
+	pushl	%res_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %size
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%size,4), %res_ptr	/* point just past the last res limb */
+	leal	(%s1_ptr,%size,4), %s1_ptr	/* point just past the last s1 limb */
+	negl	%size				/* index counts up from -SIZE to 0 */
+	xorl	%ebp, %ebp			/* carry limb = 0; also clears CF */
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+
+L(oop):	adcl	$0, %ebp			/* fold in borrow left by the previous subl */
+	movl	(%s1_ptr,%size,4), %eax		/* fetch s1 limb */
+
+	mull	%s2_limb			/* %edx:%eax = s1 limb * S2LIMB */
+
+	addl	%ebp, %eax			/* add carry limb to low product word */
+	movl	(%res_ptr,%size,4), %ebp	/* fetch res limb */
+
+	adcl	$0, %edx			/* propagate carry into high product word */
+	subl	%eax, %ebp			/* res limb - low word; CF = borrow */
+
+	movl	%ebp, (%res_ptr,%size,4)	/* store updated res limb */
+	incl	%size				/* inc leaves CF alone; ZF ends the loop */
+
+	movl	%edx, %ebp			/* high word becomes the next carry limb */
+	jnz	L(oop)
+
+	adcl	$0, %ebp			/* add the final borrow */
+	movl	%ebp, %eax			/* return the carry-out limb */
+	popl	%s2_limb
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+#undef size	/* presumably so the macro cannot rewrite END's expansion -- confirm */
+END (__mpn_submul_1)
diff --git a/REORG.TODO/sysdeps/i386/i686/Makefile b/REORG.TODO/sysdeps/i386/i686/Makefile
new file mode 100644
index 0000000000..311042787b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/Makefile
@@ -0,0 +1,12 @@
+# So that we can test __m128's alignment
+stack-align-test-flags += -msse
+
+CFLAGS-.o += -Wa,-mtune=i686
+CFLAGS-.os += -Wa,-mtune=i686
+CFLAGS-.op += -Wa,-mtune=i686
+CFLAGS-.oS += -Wa,-mtune=i686
+
+ASFLAGS-.o += -Wa,-mtune=i686
+ASFLAGS-.os += -Wa,-mtune=i686
+ASFLAGS-.op += -Wa,-mtune=i686
+ASFLAGS-.oS += -Wa,-mtune=i686
diff --git a/REORG.TODO/sysdeps/i386/i686/add_n.S b/REORG.TODO/sysdeps/i386/i686/add_n.S
new file mode 100644
index 0000000000..4afa648ceb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/add_n.S
@@ -0,0 +1,110 @@
+/* Add two limb vectors of the same length > 0 and store sum in a third
+   limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+8		/* space for 2 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+#ifdef PIC
+L(1):	addl    (%esp), %eax		/* add return address, i.e. the address of L(0) */
+	ret
+#endif
+ENTRY (__mpn_add_n)
+
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 4)
+	movl	S1(%esp),%esi
+	cfi_rel_offset (esi, 0)
+	movl	S2(%esp),%edx
+	movl	SIZE(%esp),%ecx
+	movl	%ecx,%eax
+	shrl	$3,%ecx			/* compute count for unrolled loop */
+	negl	%eax
+	andl	$7,%eax			/* get index where to start loop */
+	jz	L(oop)			/* necessary special case for 0 */
+	incl	%ecx			/* adjust loop count */
+	shll	$2,%eax			/* adjustment for pointers... */
+	subl	%eax,%edi		/* ... since they are offset ... */
+	subl	%eax,%esi		/* ... by a constant when we ... */
+	subl	%eax,%edx		/* ... enter the loop */
+	shrl	$2,%eax			/* restore previous value; only zero bits are shifted out */
+#ifdef PIC
+/* Calculate start address in loop for PIC.  The index is scaled by 9 (%eax + %eax*8) and biased by -3, so this relies on each limb step in the loop assembling to 9 bytes and the first one to 6.  */
+	leal	(L(oop)-L(0)-3)(%eax,%eax,8),%eax
+	call	L(1)
+L(0):
+#else
+/* Calculate start address in loop for non-PIC.  The index is scaled by 9 (%eax + %eax*8) and biased by -3, so this relies on each limb step in the loop assembling to 9 bytes and the first one to 6.  */
+ 	leal	(L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+	jmp	*%eax			/* jump into loop */
+	ALIGN (3)
+L(oop):	movl	(%esi),%eax
+	adcl	(%edx),%eax
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	adcl	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	adcl	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	adcl	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	adcl	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	adcl	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	adcl	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	adcl	28(%edx),%eax
+	movl	%eax,28(%edi)
+	leal	32(%edi),%edi		/* lea is used for the pointer bumps: it leaves CF alone */
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx			/* dec also leaves CF alone for the next adcl */
+	jnz	L(oop)
+
+	sbbl	%eax,%eax		/* %eax = -CF */
+	negl	%eax			/* return the carry-out as 0 or 1 */
+
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_add_n)
diff --git a/REORG.TODO/sysdeps/i386/i686/bcopy.S b/REORG.TODO/sysdeps/i386/i686/bcopy.S
new file mode 100644
index 0000000000..15ef9419a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/bcopy.S
@@ -0,0 +1,3 @@
+#define USE_AS_BCOPY
+#define memmove bcopy
+#include <sysdeps/i386/i686/memmove.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/bzero.S b/REORG.TODO/sysdeps/i386/i686/bzero.S
new file mode 100644
index 0000000000..c7898f18e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/bzero.S
@@ -0,0 +1,4 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include <sysdeps/i386/i686/memset.S>
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/i686/dl-hash.h b/REORG.TODO/sysdeps/i386/i686/dl-hash.h
new file mode 100644
index 0000000000..ceda785b32
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/dl-hash.h
@@ -0,0 +1,79 @@
+/* Compute hash value for given string according to ELF standard.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_HASH_H
+#define _DL_HASH_H	1
+
+
+/* This is the hashing function specified by the ELF ABI.  It is highly
+   optimized for the PII processors.  Though it will run on i586 it
+   would be much slower than the generic C implementation.  So don't
+   use it.  */
+static unsigned int
+__attribute__ ((unused))
+_dl_elf_hash (const char *name)
+{
+  unsigned int result;
+  unsigned int temp0;
+  unsigned int temp1;
+
+  __asm__ __volatile__
+    ("movzbl (%1),%2\n\t"	/* %2 = first byte of NAME */
+     "testl %2, %2\n\t"
+     "jz 1f\n\t"	/* empty string: result keeps its initial 0 */
+     "movl %2, %0\n\t"
+     "movzbl 1(%1), %2\n\t"
+     "jecxz 1f\n\t"	/* %2 is pinned to %ecx (c constraint), so jecxz tests the byte just loaded */
+     "shll $4, %0\n\t"
+     "addl %2, %0\n\t"
+     "movzbl 2(%1), %2\n\t"
+     "jecxz 1f\n\t"
+     "shll $4, %0\n\t"
+     "addl %2, %0\n\t"
+     "movzbl 3(%1), %2\n\t"
+     "jecxz 1f\n\t"
+     "shll $4, %0\n\t"
+     "addl %2, %0\n\t"
+     "movzbl 4(%1), %2\n\t"
+     "jecxz 1f\n\t"
+     "shll $4, %0\n\t"
+     "addl $5, %1\n\t"	/* five bytes consumed; they cannot reach bits 28-31, so no fold was needed */
+     "addl %2, %0\n\t"
+     "movzbl (%1), %2\n\t"
+     "jecxz 1f\n"
+     "2:\t"	/* general loop: one byte per iteration, with fold */
+     "shll $4, %0\n\t"
+     "movl $0xf0000000, %3\n\t"
+     "incl %1\n\t"
+     "addl %2, %0\n\t"
+     "andl %0, %3\n\t"	/* %3 = top four bits of the hash */
+     "andl $0x0fffffff, %0\n\t"	/* clear those bits in the hash */
+     "shrl $24, %3\n\t"
+     "movzbl (%1), %2\n\t"
+     "xorl %3, %0\n\t"	/* fold the saved top bits back in, shifted down by 24 */
+     "testl %2, %2\n\t"
+     "jnz 2b\n"
+     "1:\t"
+     : "=&r" (result), "=r" (name), "=&c" (temp0), "=&r" (temp1)
+     : "0" (0), "1" ((const unsigned char *) name));
+
+  return result;
+}
+
+#endif /* dl-hash.h */
diff --git a/REORG.TODO/sysdeps/i386/i686/ffs.c b/REORG.TODO/sysdeps/i386/i686/ffs.c
new file mode 100644
index 0000000000..cbe36ff873
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/ffs.c
@@ -0,0 +1,48 @@
+/* ffs -- find first set bit in a word, counted from least significant end.
+   For Intel 80x86, x>=6.
+   This file is part of the GNU C Library.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef	ffs
+
+#ifdef	__GNUC__
+
+int
+__ffs (int x)
+{
+  int cnt;
+  int tmp;
+
+  asm ("bsfl %2,%0\n"		/* Index of lowest set bit of X into %0.  */
+       "cmovel %1,%0\n"		/* If number was zero, use -1 as result.  */
+       : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1));
+
+  return cnt + 1;		/* 1-based bit position; 0 when X was zero.  */
+}
+weak_alias (__ffs, ffs)
+libc_hidden_def (__ffs)
+libc_hidden_builtin_def (ffs)
+#undef ffsl
+weak_alias (__ffs, ffsl)
+
+#else
+#include <string/ffs.c>
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S
new file mode 100644
index 0000000000..73060b088c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+	.text
+ENTRY(__ieee754_log)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+	fucomi	%st		// compare x with itself: unordered only for NaN
+	jp	3f		// PF set: x is NaN
+	fyl2x			// log(2) * log2(x) = log(x)
+	ret
+
+3:	fstp	%st(1)		// NaN input: drop log(2), return x
+	ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+	fyl2x			// log(2) * log2(x) = log(x); no NaN check here
+	ret
+END(__log_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S
new file mode 100644
index 0000000000..6fd39d50d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+	.text
+ENTRY(__ieee754_logf)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+	fucomi	%st		// compare x with itself: unordered only for NaN
+	jp	3f		// PF set: x is NaN
+	fyl2x			// log(2) * log2(x) = log(x)
+	ret
+
+3:	fstp	%st(1)		// NaN input: drop log(2), return x
+	ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+	fyl2x			// log(2) * log2(x) = log(x); no NaN check here
+	ret
+END(__logf_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S
new file mode 100644
index 0000000000..7e3bc8d817
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S
@@ -0,0 +1,94 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst8,"aM",@progbits,8
+
+	.p2align 3
+	.type one,@object
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	.type limit,@object
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_logl)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+	fucomi	%st		// compare x with itself: unordered only for NaN
+	jp	3f		// PF set: x is NaN
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	movzwl	4+8(%esp), %eax	// sign and exponent word of x
+	cmpl	$0xc000, %eax
+	jae	5f		// x <= -2, avoid overflow from -LDBL_MAX - 1.
+	fsubl	MO(one)		// x-1 : x : log(2)
+5:	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fld	MO(limit)	// 0.29 : |x-1| : x-1 : x : log(2)
+	fcomip	%st(1)		// |x-1| : x-1 : x : log(2)
+	fstp	%st(0)		// x-1 : x : log(2)
+	jc	2f		// 0.29 < |x-1| (or unordered): use fyl2x on x
+	fxam			// classify x-1
+	fnstsw			// status word to %ax
+	andb	$0x45, %ah	// keep C3, C2 and C0
+	cmpb	$0x40, %ah	// C3 alone set: x-1 is zero
+	jne	4f
+	fabs			// log(1) is +0 in all rounding modes.
+4:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	fstp	%st(1)		// NaN input: drop log(2)
+	fadd	%st(0)		// return x + x (presumably to quiet a signaling NaN)
+	ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fld	MO(limit)	// 0.29 : |x-1| : x-1 : x : log(2)
+	fcomip	%st(1)		// |x-1| : x-1 : x : log(2)
+	fstp	%st(0)		// x-1 : x : log(2)
+	jc	2b		// 0.29 < |x-1|: reuse the fyl2x tail at label 2 in __ieee754_logl
+	fxam			// classify x-1
+	fnstsw			// status word to %ax
+	andb	$0x45, %ah	// keep C3, C2 and C0
+	cmpb	$0x40, %ah	// C3 alone set: x-1 is zero
+	jne	6f
+	fabs			// log(1) is +0 in all rounding modes.
+6:	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logl_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile
new file mode 100644
index 0000000000..7d9089232f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),math)
+libm-sysdep_routines += e_expf-sse2 e_expf-ia32 s_sinf-sse2 s_cosf-sse2 \
+                        s_sincosf-sse2
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S
new file mode 100644
index 0000000000..b486b4d1ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S
@@ -0,0 +1,22 @@
+/*
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define __ieee754_expf __ieee754_expf_ia32
+#define __expf_finite __expf_finite_ia32
+
+#include <sysdeps/i386/fpu/e_expf.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
new file mode 100644
index 0000000000..e6bb6fa289
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
@@ -0,0 +1,325 @@
+/* SSE2 version of __ieee754_expf and __expf_finite
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ *  Let K = 64 (table size).
+ *       e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ *  where
+ *       x = m*log(2)/K + y,    y in [0.0..log(2)/K]
+ *       m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
+ *       values of 2^(j/K) are tabulated as T[j].
+ *
+ *       P(y) is a minimax polynomial approximation of expf(y)-1
+ *       on small interval [0.0..log(2)/K].
+ *
+ *       P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ *       z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ *  __ieee754_expf_sse2(NaN) = NaN
+ *  __ieee754_expf_sse2(+INF) = +INF
+ *  __ieee754_expf_sse2(-INF) = 0
+ *  __ieee754_expf_sse2(x) = 1 for subnormals
+ *  for finite argument, only __ieee754_expf_sse2(0)=1 is exact
+ *  __ieee754_expf_sse2(x) overflows if x>700
+ *  __ieee754_expf_sse2(x) underflows if x<-700
+ *
+ * Note:
+ *  For |x|<700, __ieee754_expf_sse2 computes result in double precision,
+ *  with accuracy a bit more than needed for expf, and rounds it to
+ *  single precision only once, at the very end.
+ */
+
+
+#ifdef	PIC
+# define MO1(symbol)			L(symbol)##@GOTOFF(%edx)
+# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%edx,reg2,_scale)
+#else
+# define MO1(symbol)			L(symbol)
+# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
+#endif
+
+	.text
+ENTRY(__ieee754_expf_sse2)
+	/* Input: single precision x on stack at address 4(%esp) */
+
+#ifdef	PIC
+	LOAD_PIC_REG(dx)
+#endif
+
+	cvtss2sd	4(%esp), %xmm1	/* Convert x to double precision */
+	mov	4(%esp), %ecx		/* Copy x */
+	movsd	MO1(DP_KLN2), %xmm2	/* DP K/log(2) */
+	movsd	MO1(DP_P2), %xmm3	/* DP P2 */
+	movl	%ecx, %eax		/* x */
+	mulsd	%xmm1, %xmm2		/* DP x*K/log(2) */
+	andl	$0x7fffffff, %ecx	/* |x| */
+	cmpl	$0x442f0000, %ecx	/* |x|<700 ? */
+	movsd	MO1(DP_P3), %xmm4	/* DP P3 */
+	addsd	MO1(DP_RS), %xmm2	/* DP x*K/log(2)+RS */
+	jae	L(special_paths)	/* flags are still those of the cmpl: movsd/addsd leave EFLAGS alone */
+
+	/* Here if |x|<700 */
+	cmpl	$0x31800000, %ecx	/* |x|<2^(-28) ? */
+	jb	L(small_arg)
+
+	/* Main path: here if 2^(-28)<=|x|<700 */
+	cvtsd2ss	%xmm2, %xmm2	/* SP x*K/log(2)+RS */
+	movd	%xmm2, %eax		/* bits of n*K+j with trash */
+	subss	MO1(SP_RS), %xmm2	/* SP t=round(x*K/log(2)) */
+	movl	%eax, %ecx		/* n*K+j with trash */
+	cvtss2sd	%xmm2, %xmm2	/* DP t */
+	andl	$0x3f, %eax		/* bits of j */
+	mulsd	MO1(DP_NLN2K), %xmm2	/* DP -t*log(2)/K */
+	andl	$0xffffffc0, %ecx	/* bits of n */
+#ifdef __AVX__
+	vaddsd	%xmm1, %xmm2, %xmm0	/* DP y=x-t*log(2)/K */
+	vmulsd	%xmm0, %xmm0, %xmm2	/* DP z=y*y */
+#else
+	addsd	%xmm1, %xmm2		/* DP y=x-t*log(2)/K */
+	movaps	%xmm2, %xmm0		/* DP y */
+	mulsd	%xmm2, %xmm2		/* DP z=y*y */
+#endif
+	mulsd	%xmm2, %xmm4		/* DP P3*z */
+	addl	$0xffc0, %ecx		/* bits of n + DP exponent bias */
+	mulsd	%xmm2, %xmm3		/* DP P2*z */
+	shrl	$2, %ecx		/* High 2 bytes of DP 2^n */
+	pxor	%xmm1, %xmm1		/* clear %xmm1 */
+	addsd	MO1(DP_P1), %xmm4	/* DP P3*z+P1 */
+	addsd	MO1(DP_P0), %xmm3	/* DP P2*z+P0 */
+	pinsrw	$3, %ecx, %xmm1		/* DP 2^n */
+	mulsd	%xmm2, %xmm4		/* DP (P3*z+P1)*z */
+	mulsd	%xmm3, %xmm0		/* DP (P2*z+P0)*y */
+	addsd	%xmm4, %xmm0		/* DP P(y) */
+	mulsd	MO2(DP_T,%eax,8), %xmm0	/* DP P(y)*T[j] */
+	addsd	MO2(DP_T,%eax,8), %xmm0	/* DP T[j]*(P(y)+1) */
+	mulsd	%xmm1, %xmm0		/* DP result=2^n*(T[j]*(P(y)+1)) */
+	cvtsd2ss	%xmm0, %xmm1	/* SP result (the only rounding to single precision) */
+
+	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
+	movss	%xmm1, 0(%esp)		/* Move result from sse... */
+	flds	0(%esp)			/* ...to FPU. */
+	lea	4(%esp), %esp		/* Return back 4 bytes of stack frame */
+	ret
+
+	.p2align	4
+L(small_arg):
+	/* Here if 0<=|x|<2^(-28) */
+	movss	4(%esp), %xmm0		/* load x */
+	addss	MO1(SP_ONE), %xmm0	/* 1.0 + x */
+	/* Return 1.0 with inexact raised, except for x==0 */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(special_paths):
+	/* Here if x is NaN, or Inf, or finite |x|>=700 */
+	movss	4(%esp), %xmm0		/* load x */
+
+	cmpl	$0x7f800000, %ecx	/* |x| is finite ? */
+	jae	L(arg_inf_or_nan)
+
+	/* Here if finite |x|>=700 */
+	testl	$0x80000000, %eax	/* sign of x nonzero ? */
+	je	L(res_overflow)
+
+	/* Here if finite x<=-700 */
+	movss	MO1(SP_SMALL), %xmm0	/* load small value 2^(-100) */
+	mulss	%xmm0, %xmm0		/* Return underflowed result (zero or subnormal) */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(res_overflow):
+	/* Here if finite x>=700 */
+	movss	MO1(SP_LARGE), %xmm0	/* load large value 2^100 */
+	mulss	%xmm0, %xmm0		/* Return overflowed result (Inf or max normal) */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_inf_or_nan):
+	/* Here if |x| is Inf or NAN */
+	jne	L(arg_nan)	/* |x| is Inf ? */
+
+	/* Here if |x| is Inf */
+	shrl	$31, %eax		/* Get sign bit of x */
+	movss	MO2(SP_INF_0,%eax,4), %xmm0/* return zero or Inf, depending on sign of x */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_nan):
+	/* Here if |x| is NaN */
+	addss	%xmm0, %xmm0		/* Return x+x (raise invalid) */
+
+	.p2align	4
+L(epilogue):				/* L(arg_nan) falls through to here */
+	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
+	movss	%xmm0, 0(%esp)		/* Move result from sse... */
+	flds	0(%esp)			/* ...to FPU. */
+	lea	4(%esp), %esp		/* Return back 4 bytes of stack frame */
+	ret
+END(__ieee754_expf_sse2)
+
+	.section .rodata, "a"
+	.p2align 3
+L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */
+	.long	0x00000000, 0x3ff00000
+	.long	0x3e778061, 0x3ff02c9a
+	.long	0xd3158574, 0x3ff059b0
+	.long	0x18759bc8, 0x3ff08745
+	.long	0x6cf9890f, 0x3ff0b558
+	.long	0x32d3d1a2, 0x3ff0e3ec
+	.long	0xd0125b51, 0x3ff11301
+	.long	0xaea92de0, 0x3ff1429a
+	.long	0x3c7d517b, 0x3ff172b8
+	.long	0xeb6fcb75, 0x3ff1a35b
+	.long	0x3168b9aa, 0x3ff1d487
+	.long	0x88628cd6, 0x3ff2063b
+	.long	0x6e756238, 0x3ff2387a
+	.long	0x65e27cdd, 0x3ff26b45
+	.long	0xf51fdee1, 0x3ff29e9d
+	.long	0xa6e4030b, 0x3ff2d285
+	.long	0x0a31b715, 0x3ff306fe
+	.long	0xb26416ff, 0x3ff33c08
+	.long	0x373aa9cb, 0x3ff371a7
+	.long	0x34e59ff7, 0x3ff3a7db
+	.long	0x4c123422, 0x3ff3dea6
+	.long	0x21f72e2a, 0x3ff4160a
+	.long	0x6061892d, 0x3ff44e08
+	.long	0xb5c13cd0, 0x3ff486a2
+	.long	0xd5362a27, 0x3ff4bfda
+	.long	0x769d2ca7, 0x3ff4f9b2
+	.long	0x569d4f82, 0x3ff5342b
+	.long	0x36b527da, 0x3ff56f47
+	.long	0xdd485429, 0x3ff5ab07
+	.long	0x15ad2148, 0x3ff5e76f
+	.long	0xb03a5585, 0x3ff6247e
+	.long	0x82552225, 0x3ff66238
+	.long	0x667f3bcd, 0x3ff6a09e
+	.long	0x3c651a2f, 0x3ff6dfb2
+	.long	0xe8ec5f74, 0x3ff71f75
+	.long	0x564267c9, 0x3ff75feb
+	.long	0x73eb0187, 0x3ff7a114
+	.long	0x36cf4e62, 0x3ff7e2f3
+	.long	0x994cce13, 0x3ff82589
+	.long	0x9b4492ed, 0x3ff868d9
+	.long	0x422aa0db, 0x3ff8ace5
+	.long	0x99157736, 0x3ff8f1ae
+	.long	0xb0cdc5e5, 0x3ff93737
+	.long	0x9fde4e50, 0x3ff97d82
+	.long	0x82a3f090, 0x3ff9c491
+	.long	0x7b5de565, 0x3ffa0c66
+	.long	0xb23e255d, 0x3ffa5503
+	.long	0x5579fdbf, 0x3ffa9e6b
+	.long	0x995ad3ad, 0x3ffae89f
+	.long	0xb84f15fb, 0x3ffb33a2
+	.long	0xf2fb5e47, 0x3ffb7f76
+	.long	0x904bc1d2, 0x3ffbcc1e
+	.long	0xdd85529c, 0x3ffc199b
+	.long	0x2e57d14b, 0x3ffc67f1
+	.long	0xdcef9069, 0x3ffcb720
+	.long	0x4a07897c, 0x3ffd072d
+	.long	0xdcfba487, 0x3ffd5818
+	.long	0x03db3285, 0x3ffda9e6
+	.long	0x337b9b5f, 0x3ffdfc97
+	.long	0xe78b3ff6, 0x3ffe502e
+	.long	0xa2a490da, 0x3ffea4af
+	.long	0xee615a27, 0x3ffefa1b
+	.long	0x5b6e4540, 0x3fff5076
+	.long	0x819e90d8, 0x3fffa7c1
+	.type L(DP_T), @object
+	ASM_SIZE_DIRECTIVE(L(DP_T))
+
+	.section .rodata.cst8,"aM",@progbits,8
+	.p2align 3
+L(DP_KLN2): /* double precision K/log(2) */
+	.long	0x652b82fe, 0x40571547
+	.type L(DP_KLN2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_KLN2))
+
+	.p2align 3
+L(DP_NLN2K): /* double precision -log(2)/K */
+	.long	0xfefa39ef, 0xbf862e42
+	.type L(DP_NLN2K), @object
+	ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
+
+	.p2align 3
+L(DP_RS): /* double precision 2^23+2^22 */
+	.long	0x00000000, 0x41680000
+	.type L(DP_RS), @object
+	ASM_SIZE_DIRECTIVE(L(DP_RS))
+
+	.p2align 3
+L(DP_P3): /* double precision polynomial coefficient P3 */
+	.long	0xeb78fa85, 0x3fa56420
+	.type L(DP_P3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_P3))
+
+	.p2align 3
+L(DP_P1): /* double precision polynomial coefficient P1 */
+	.long	0x008d6118, 0x3fe00000
+	.type L(DP_P1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_P1))
+
+	.p2align 3
+L(DP_P2): /* double precision polynomial coefficient P2 */
+	.long	0xda752d4f, 0x3fc55550
+	.type L(DP_P2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_P2))
+
+	.p2align 3
+L(DP_P0): /* double precision polynomial coefficient P0 */
+	.long	0xffffe7c6, 0x3fefffff
+	.type L(DP_P0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_P0))
+
+	.p2align 2
+L(SP_INF_0):
+	.long	0x7f800000	/* single precision Inf */
+	.long	0		/* single precision zero */
+	.type L(SP_INF_0), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INF_0))
+
+	.section .rodata.cst4,"aM",@progbits,4
+	.p2align 2
+L(SP_RS): /* single precision 2^23+2^22 */
+	.long	0x4b400000
+	.type L(SP_RS), @object
+	ASM_SIZE_DIRECTIVE(L(SP_RS))
+
+	.p2align 2
+L(SP_SMALL): /* single precision small value 2^(-100) */
+	.long	0x0d800000
+	.type L(SP_SMALL), @object
+	ASM_SIZE_DIRECTIVE(L(SP_SMALL))
+
+	.p2align 2
+L(SP_LARGE): /* single precision large value 2^100 */
+	.long	0x71800000
+	.type L(SP_LARGE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_LARGE))
+
+	.p2align 2
+L(SP_ONE): /* single precision 1.0 */
+	.long	0x3f800000
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+strong_alias (__ieee754_expf_sse2, __expf_finite_sse2)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c
new file mode 100644
index 0000000000..388cf98a39
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c
@@ -0,0 +1,37 @@
+/* Multiple versions of expf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern double __ieee754_expf_sse2 (double);	/* SSE2 variant (e_expf-sse2.S).  */
+extern double __ieee754_expf_ia32 (double);	/* Generic i386 variant (e_expf-ia32.S).  */
+
+double __ieee754_expf (double);	/* NOTE(review): prototypes here use double although expf works on float -- confirm this is intentional.  */
+libm_ifunc (__ieee754_expf,
+	    HAS_CPU_FEATURE (SSE2)	/* Pick the SSE2 variant when the CPU reports SSE2.  */
+	    ? __ieee754_expf_sse2
+	    : __ieee754_expf_ia32);
+
+extern double __expf_finite_sse2 (double);	/* Alias of __ieee754_expf_sse2 (strong_alias in e_expf-sse2.S).  */
+extern double __expf_finite_ia32 (double);	/* Generic i386 variant (e_expf-ia32.S).  */
+
+double __expf_finite (double);	/* NOTE(review): same double-vs-float question as above.  */
+libm_ifunc (__expf_finite,
+	    HAS_CPU_FEATURE (SSE2)	/* Pick the SSE2 variant when the CPU reports SSE2.  */
+	    ? __expf_finite_sse2
+	    : __expf_finite_ia32);
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
new file mode 100644
index 0000000000..04bc23b37b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -0,0 +1,2188 @@
+# Begin of automatic generation
+
+# Maximal error of functions:
+Function: "acos":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "acos_downward":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_towardzero":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "acosh":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 2
+
+Function: "acosh_downward":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_towardzero":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 3
+
+Function: "asin":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "asin_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asinh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "asinh_downward":
+double: 1
+float: 1
+idouble: 1
+ildouble: 5
+ldouble: 5
+
+Function: "asinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "asinh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "atan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "atanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "atanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 3
+
+Function: "atanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "cabs":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacos_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "cacos_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "cacos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacosh_downward":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cacosh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacosh_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "carg":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "casin_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "casin_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "casin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casin_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casinh_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Imaginary part of "casinh_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "casinh_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "casinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "catan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "catanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catanh":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cbrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "cbrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccosh_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cexp":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cexp":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cexp_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cexp_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "clog":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog10":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog10":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "clog10_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "clog10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos":
+ildouble: 1
+ldouble: 1
+
+Function: "cos_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cos_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh":
+double: 1
+float: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: "cosh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: Real part of "cpow":
+double: 2
+float: 5
+idouble: 2
+ifloat: 5
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cpow":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "cpow_downward":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cpow_towardzero":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cpow_upward":
+double: 4
+float: 1
+idouble: 4
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cpow_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+
+Function: Real part of "csin_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "csinh_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csqrt":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csqrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ctan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctan_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ctanh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctanh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "erf":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erfc":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "erfc_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "exp":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_downward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_upward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "expm1":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "expm1_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "gamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "gamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "hypot":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "j0_downward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j0_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "j0_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j1":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j1_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "j1_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: "jn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "jn_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "lgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "lgamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "log":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log1p":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log1p_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "log2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow_downward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_towardzero":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "sin":
+ildouble: 1
+ldouble: 1
+
+Function: "sin_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sin_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sin_upward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos":
+ildouble: 1
+ldouble: 1
+
+Function: "sincos_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sincos_upward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sinh":
+double: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sinh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "sinh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "sinh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "tan":
+float: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "tan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "tanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 7
+ldouble: 4
+
+Function: "tanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 4
+
+Function: "tgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_downward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "y0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "y0_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "y1":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "y1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "y1_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y1_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: "yn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "yn_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "yn_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "yn_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+# end of automatic generation
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name
new file mode 100644
index 0000000000..193dd704b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name
@@ -0,0 +1 @@
+i686
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S
new file mode 100644
index 0000000000..f37850d0b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S
@@ -0,0 +1,553 @@
+/* Optimized with sse2 version of cosf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x| == 0: return 1.0-|x|.
+ *  2) if |x| <  2^-27: return 1.0-|x|.
+ *  3) if |x| <  2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1.
+ *  4) if |x| <   Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
+ *           t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          s = (-1.0)^((n>>2)&1)
+ *          if(n&2 != 0) {
+ *              using cos(t) polynomial for |t|<Pi/4, result is
+ *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ *          } else {
+ *              using sin(t) polynomial for |t|<Pi/4, result is
+ *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
+ *           t=|x|-j*Pi/4.
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
+ *           t=|x|-j*Pi/4.
+ *      7.2) Reconstruction same as (5.2).
+ *  8) if x is Inf, return x-x, and set errno=EDOM.
+ *  9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ *  cos(+-0) = 1 not raising inexact,
+ *  cos(subnormal) raises inexact,
+ *  cos(min_normalized) raises inexact,
+ *  cos(normalized) raises inexact,
+ *  cos(Inf) = NaN, raises invalid, sets errno to EDOM,
+ *  cos(NaN) = NaN.
+ */
+
+#ifdef	PIC	/* PIC build: local data is addressed @GOTOFF relative to %ebx.  */
+# define MO1(symbol)			L(symbol)##@GOTOFF(%ebx)	/* Operand for L(symbol).  */
+# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%ebx,reg2,_scale)	/* Same, indexed by reg2*_scale.  */
+# define CFI_PUSH(REG)	cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG)	cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG)			pushl REG; CFI_PUSH(REG)
+# define POP(REG)			popl REG; CFI_POP(REG)
+# define ENTRANCE			PUSH(%ebx); LOAD_PIC_REG(bx)	/* Save %ebx, then load the PIC base into it.  */
+# define RETURN				POP(%ebx); ret; CFI_PUSH(%ebx)	/* CFI_PUSH after ret re-describes the frame for the code that follows.  */
+# define ARG_X				8(%esp)	/* One slot further than non-PIC because ENTRANCE pushed %ebx.  */
+#else	/* Non-PIC build: absolute addressing, no register saved.  */
+# define MO1(symbol)			L(symbol)
+# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN				ret
+# define ARG_X				4(%esp)
+#endif
+
+	.text
+ENTRY(__cosf_sse2)
+	/* Input: single precision x on stack at address ARG_X.
+	   Output: cos(x), left on the x87 stack by the flds/fldl that
+	   precedes every RETURN in this function.  */
+
+	ENTRANCE
+	movl	ARG_X, %eax		/* Raw bit pattern of x */
+	cvtss2sd ARG_X, %xmm0		/* DP x */
+	andl	$0x7fffffff, %eax	/* Bit pattern of |x| (sign bit cleared) */
+
+	cmpl	$0x3f490fdb, %eax	/* |x|<Pi/4?  (compared as integer bit patterns) */
+	jb	L(arg_less_pio4)
+
+	/* Here if |x|>=Pi/4 */
+	movd	%eax, %xmm3		/* SP |x| */
+	andpd	MO1(DP_ABS_MASK),%xmm0	/* DP |x| */
+	movss	MO1(SP_INVPIO4), %xmm2	/* SP 1/(Pi/4) */
+
+	cmpl	$0x40e231d6, %eax	/* |x|<9*Pi/4?  */
+	jae	L(large_args)
+
+	/* Here if Pi/4<=|x|<9*Pi/4 */
+	mulss	%xmm3, %xmm2		/* SP |x|/(Pi/4) */
+	cvttss2si %xmm2, %eax		/* k, number of Pi/4 in x (truncated) */
+	addl	$1, %eax		/* k+1 */
+	movl	$0x0e, %edx
+	andl	%eax, %edx		/* j = (k+1)&0x0e */
+	addl	$2, %eax		/* n = k+3 */
+	subsd	MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - PIO4J[j] = |x| - j * Pi/4;
+					    falls through to L(reconstruction) */
+
+L(reconstruction):
+	/* Input: %eax=n, %xmm0=t */
+	testl	$2, %eax		/* n&2 != 0?  */
+	jz	L(sin_poly)
+
+/*L(cos_poly):*/
+	/* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = (-1.0)^((n>>2)&1)   (cos is even, so sign(x) plays no part)
+	 * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
+	 * The polynomial is evaluated as two interleaved chains in z
+	 * (even coefficients in %xmm4, odd ones in %xmm3) that are summed
+	 * at the end.
+	 */
+	shrl	$2, %eax		/* n>>2 */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	andl	$1, %eax		/* (n>>2)&1 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=t^4 */
+
+	movsd	MO1(DP_C4), %xmm4	/* C4 */
+	mulsd	%xmm0, %xmm4		/* z*C4 */
+	movsd	MO1(DP_C3), %xmm3	/* C3 */
+	mulsd	%xmm0, %xmm3		/* z*C3 */
+	addsd	MO1(DP_C2), %xmm4	/* C2+z*C4 */
+	mulsd	%xmm0, %xmm4		/* z*(C2+z*C4) */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame (one double) */
+	addsd	MO1(DP_C1), %xmm3	/* C1+z*C3 */
+	mulsd	%xmm0, %xmm3		/* z*(C1+z*C3) */
+	addsd	MO1(DP_C0), %xmm4	/* C0+z*(C2+z*C4) */
+	mulsd	%xmm1, %xmm4		/* y*(C0+z*(C2+z*C4)) */
+
+	addsd	%xmm4, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	/* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	addsd	MO1(DP_ONES), %xmm3
+
+	mulsd	MO2(DP_ONES,%eax,8), %xmm3 /* DP result: times DP_ONES[(n>>2)&1],
+					      i.e. +1.0 or -1.0 */
+	movsd	%xmm3, 0(%esp)		/* Move result from sse...  */
+	fldl	0(%esp)			/* ...to FPU.  */
+	/* Return back 8 bytes of stack frame */
+	lea	8(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(sin_poly):
+	/* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = (-1.0)^((n>>2)&1)   (cos is even, so sign(x) plays no part)
+	 * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
+	 */
+
+	movaps	%xmm0, %xmm4		/* t */
+	shrl	$2, %eax		/* n>>2 */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	andl	$1, %eax		/* (n>>2)&1 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=t^4 */
+
+	movsd	MO1(DP_S4), %xmm2	/* S4 */
+	mulsd	%xmm0, %xmm2		/* z*S4 */
+	movsd	MO1(DP_S3), %xmm3	/* S3 */
+	mulsd	%xmm0, %xmm3		/* z*S3 */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame (one double) */
+	addsd	MO1(DP_S2), %xmm2	/* S2+z*S4 */
+	mulsd	%xmm0, %xmm2		/* z*(S2+z*S4) */
+	addsd	MO1(DP_S1), %xmm3	/* S1+z*S3 */
+	mulsd	%xmm0, %xmm3		/* z*(S1+z*S3) */
+	addsd	MO1(DP_S0), %xmm2	/* S0+z*(S2+z*S4) */
+	mulsd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
+	/* t*s, where s = (-1.0)^((n>>2)&1) is read from DP_ONES[(n>>2)&1] */
+	mulsd	MO2(DP_ONES,%eax,8), %xmm4
+	addsd	%xmm2, %xmm3		/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	/* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	mulsd	%xmm4, %xmm3
+	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
+	addsd	%xmm4, %xmm3
+	movsd	%xmm3, 0(%esp)		/* Move result from sse...   */
+	fldl	0(%esp)			/* ...to FPU.  */
+	/* Return back 8 bytes of stack frame */
+	lea	8(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(large_args):
+	/* Here if |x|>=9*Pi/4.  %eax still holds the bit pattern of |x|.  */
+	cmpl	$0x7f800000, %eax	/* x is Inf or NaN?  (the flags set here are
+					   tested again at L(arg_inf_or_nan)) */
+	jae	L(arg_inf_or_nan)
+
+	/* Here if finite |x|>=9*Pi/4 */
+	cmpl	$0x4b000000, %eax	/* |x|<2^23?  */
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	MO1(DP_INVPIO4), %xmm1	/* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax		/* k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4		/* DP j */
+	movsd	MO1(DP_PIO4HI), %xmm2	/* -PIO4HI = high part of -Pi/4 */
+	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
+	movsd	MO1(DP_PIO4LO), %xmm3	/* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
+	addl	$2, %eax		/* n = k+3 */
+	mulsd	%xmm3, %xmm4		/* -j*PIO4LO */
+	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align	4
+L(very_large_args):
+	/* Here if finite |x|>=2^23.  %eax holds the bit pattern of |x|,
+	   %xmm0 holds DP |x|.  */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax		/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias
+	   (0x7f - 59 = 68, hence the single subtraction) */
+	subl	$68, %eax
+	movl	$28, %ecx		/* %cl=28 */
+	movl	%eax, %edx		/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl			/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3		/* |x| */
+	/* clear unneeded remainder from %ah */
+	andl	$0xff, %eax
+
+	imull	$28, %eax, %ecx		/* j*28 */
+	movsd	MO1(DP_HI_MASK), %xmm4	/* DP_HI_MASK */
+	movapd	%xmm0, %xmm5		/* |x| */
+	mulsd	-2*8+MO2(_FPI,%eax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1		/* |x| */
+	mulsd	-1*8+MO2(_FPI,%eax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
+	mulsd	0*8+MO2(_FPI,%eax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx		/* j*28+19 */
+	mulsd	1*8+MO2(_FPI,%eax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx		/* bitpos>=j*28+19?  */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4		/* HI(tmp3) */
+	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	MO1(DP_2POW52), %xmm6	/* +2^52 */
+	movapd	%xmm5, %xmm2		/* tmp2 copy */
+	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
+	movsd	8+MO1(DP_2POW52), %xmm4	/* -2^52 (second entry of DP_2POW52) */
+	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
+	comisd	%xmm5, %xmm4		/* tmp4 > tmp5?  */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5, i.e. adding 2^52 rounded tmp5 upward:
+	   step both the integer k and its DP image tmp4 back by one.  */
+	subl	$1, %eax		/* k-- */
+	addsd	8+MO1(DP_ONES), %xmm4	/* tmp4 -= 1.0 (adds DP_ONES[1] = -1.0) */
+L(very_large_skip2):
+
+	andl	%eax, %edx		/* k&1 */
+	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
+	addsd	MO2(DP_ZERONE,%edx,8), %xmm3 /* t  = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3		/* t += tmp2 */
+	addsd	%xmm3, %xmm0		/* t += tmp0 */
+	addl	$3, %eax		/* n=k+3 */
+	addsd	%xmm1, %xmm0		/* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= Pi/4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align	4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4.  %eax = bit pattern of |x|, %xmm0 = DP x
+	   (sign still present, but only x^2 is used below).  */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5?  */
+	jl	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4 */
+	mulsd	%xmm0, %xmm0		/* y=x^2 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=x^4 */
+	movsd	MO1(DP_C4), %xmm3	/* C4 */
+	mulsd	%xmm0, %xmm3		/* z*C4 */
+	movsd	MO1(DP_C3), %xmm5	/* C3 */
+	mulsd	%xmm0, %xmm5		/* z*C3 */
+	addsd	MO1(DP_C2), %xmm3	/* C2+z*C4 */
+	mulsd	%xmm0, %xmm3		/* z*(C2+z*C4) */
+	addsd	MO1(DP_C1), %xmm5	/* C1+z*C3 */
+	mulsd	%xmm0, %xmm5		/* z*(C1+z*C3) */
+	addsd	MO1(DP_C0), %xmm3	/* C0+z*(C2+z*C4) */
+	mulsd	%xmm1, %xmm3		/* y*(C0+z*(C2+z*C4)) */
+	addsd	%xmm5, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	/* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	addsd	MO1(DP_ONES), %xmm3
+	cvtsd2ss %xmm3, %xmm3		/* SP result */
+
+L(epilogue):
+	/* Shared tail for the paths that end with a single-precision
+	   result in %xmm3.  */
+	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
+	movss	%xmm3, 0(%esp)		/* Move result from sse...  */
+	flds	0(%esp)			/* ...to FPU.  */
+	/* Return back 4 bytes of stack frame */
+	lea	4(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5.  %eax = bit pattern of |x|, %xmm0 = DP x.  */
+	cmpl	$0x32000000, %eax	/* |x|<2^-27?  */
+	jl	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5: short two-coefficient polynomial */
+	mulsd	%xmm0, %xmm0		/* DP x^2 */
+	movsd	MO1(DP_COS2_1), %xmm3	/* DP DP_COS2_1 */
+	mulsd	%xmm0, %xmm3		/* DP x^2*DP_COS2_1 */
+	addsd	MO1(DP_COS2_0), %xmm3	/* DP DP_COS2_0+x^2*DP_COS2_1 */
+	mulsd	%xmm0, %xmm3		/* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */
+	/* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */
+	addsd	MO1(DP_ONES), %xmm3
+	cvtsd2ss %xmm3, %xmm3		/* SP result */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_less_2pn27):
+	/* Here if |x|<2^-27 (this includes x == +-0, for which
+	   1.0-|x| is exactly 1.0) */
+	movss	ARG_X, %xmm0		/* x */
+	andps	MO1(SP_ABS_MASK),%xmm0	/* |x| */
+	movss	MO1(SP_ONE), %xmm3	/* 1.0 */
+	subss	%xmm0, %xmm3		/* result is 1.0-|x| */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_inf_or_nan):
+	/* Here if |x| is Inf or NAN.  The flags are still those of
+	   "cmpl $0x7f800000, %eax" in L(large_args): equal means Inf,
+	   not-equal (above) means NaN.  */
+	jne	L(skip_errno_setting)	/* in case of x is NaN */
+
+	/* Here if x is Inf. Set errno to EDOM.  */
+	call	JUMPTARGET(__errno_location)
+	movl	$EDOM, (%eax)
+
+	.p2align	4
+L(skip_errno_setting):
+	/* Here if |x| is Inf or NAN. Continued.
+	   x is (re)loaded from the stack only now, after the call above.  */
+	movss	ARG_X, %xmm3		/* load x */
+	subss	%xmm3, %xmm3		/* x-x: result is NaN */
+	jmp	L(epilogue)
+END(__cosf_sse2)
+
+	.section .rodata, "a"
+	.p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+	.long	0x00000000,0x00000000
+	.long	0x54442d18,0x3fe921fb
+	.long	0x54442d18,0x3ff921fb
+	.long	0x7f3321d2,0x4002d97c
+	.long	0x54442d18,0x400921fb
+	.long	0x2955385e,0x400f6a7a
+	.long	0x7f3321d2,0x4012d97c
+	.long	0xe9bba775,0x4015fdbb
+	.long	0x54442d18,0x401921fb
+	.long	0xbeccb2bb,0x401c463a
+	.long	0x2955385e,0x401f6a7a
+	.type L(PIO4J), @object
+	ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+	.p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+	.long	0x00000000,0x00000000
+	.long	0x6c000000,0x3ff45f30
+	.long	0x2a000000,0x3e3c9c88
+	.long	0xa8000000,0x3c54fe13
+	.long	0xd0000000,0x3aaf47d4
+	.long	0x6c000000,0x38fbb81b
+	.long	0xe0000000,0x3714acc9
+	.long	0x7c000000,0x3560e410
+	.long	0x56000000,0x33bca2c7
+	.long	0xac000000,0x31fbd778
+	.long	0xe0000000,0x300b7246
+	.long	0xe8000000,0x2e5d2126
+	.long	0x48000000,0x2c970032
+	.long	0xe8000000,0x2ad77504
+	.long	0xe0000000,0x290921cf
+	.long	0xb0000000,0x274deb1c
+	.long	0xe0000000,0x25829a73
+	.long	0xbe000000,0x23fd1046
+	.long	0x10000000,0x2224baed
+	.long	0x8e000000,0x20709d33
+	.long	0x80000000,0x1e535a2f
+	.long	0x64000000,0x1cef904e
+	.long	0x30000000,0x1b0d6398
+	.long	0x24000000,0x1964ce7d
+	.long	0x16000000,0x17b908bf
+	.type L(_FPI), @object
+	ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+ for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5.  */
+	.p2align 3
+L(DP_COS2_0):
+	.long	0xff5cc6fd,0xbfdfffff
+	.type L(DP_COS2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
+
+	.p2align 3
+L(DP_COS2_1):
+	.long	0xb178dac5,0x3fa55514
+	.type L(DP_COS2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE),@object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_S3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.type L(DP_S3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+	.p2align 3
+L(DP_S1):
+	.long	0x10c2688b,0x3f811111
+	.type L(DP_S1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+	.p2align 3
+L(DP_S4):
+	.long	0x1674b58a,0xbe5a947e
+	.type L(DP_S4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+	.p2align 3
+L(DP_S2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.type L(DP_S2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+	.p2align 3
+L(DP_S0):
+	.long	0x55551cd9,0xbfc55555
+	.type L(DP_S0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_C3):
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_C3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+	.p2align 3
+L(DP_C1):
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_C1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+	.p2align 3
+L(DP_C4):
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_C4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+	.p2align 3
+L(DP_C2):
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_C2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+	.p2align 3
+L(DP_C0):
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_C0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 (stored negated) */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* Low part of -Pi/4 (stored negated) */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983		/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+	.p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+	.long	0x7fffffff,0x7fffffff
+	.long	0x7fffffff,0x7fffffff
+	.type L(SP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+	.p2align 2
+L(SP_ONE):
+	.long	0x3f800000		/* 1.0 */
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias (__cosf, cosf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
new file mode 100644
index 0000000000..af588de9dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
@@ -0,0 +1,29 @@
+/* Multiple versions of cosf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern float __cosf_sse2 (float);	/* SSE2 assembly variant (s_cosf-sse2.S).  */
+extern float __cosf_ia32 (float);	/* Variant produced by the #include at the end of this file.  */
+float __cosf (float);
+
+libm_ifunc (__cosf, HAS_CPU_FEATURE (SSE2) ? __cosf_sse2 : __cosf_ia32);	/* Choose by the CPU's SSE2 feature bit.  */
+weak_alias (__cosf, cosf);
+
+#define COSF __cosf_ia32	/* Presumably the name the included source gives its function -- confirm there.  */
+#include <sysdeps/ieee754/flt-32/s_cosf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S
new file mode 100644
index 0000000000..f31a925522
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S
@@ -0,0 +1,586 @@
+/* Optimized with sse2 version of sincosf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x|==0:    sin(x)=x,
+ *                   cos(x)=1.
+ *  2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed,
+ *                   cos(x)=1-|x|.
+ *  3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1,
+ *                   cos(x)=1+1*x^2*DP_COS2_0+x^4*DP_COS2_1
+ *  4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
+ *                   cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          sign_sin = sign(x) * (-1.0)^(( n   >>2)&1)
+ *          sign_cos =           (-1.0)^(((n+2)>>2)&1)
+ *          poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t
+ *          poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*s+s
+ *          if(n&2 != 0) {
+ *              using cos(t) and sin(t) polynomials for |t|<Pi/4, results are
+ *              cos(x) = poly_sin * sign_cos
+ *              sin(x) = poly_cos * sign_sin
+ *          } else {
+ *              sin(x) = poly_sin * sign_sin
+ *              cos(x) = poly_cos * sign_cos
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4.
+ *      7.2) Reconstruction same as (5.2).
+ *  8) if x is Inf, return x-x, and set errno=EDOM.
+ *  9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ *  sin/cos(+-0) = +-0/1 not raising inexact/underflow,
+ *  sin/cos(subnormal) raises inexact/underflow,
+ *  sin/cos(min_normalized) raises inexact/underflow,
+ *  sin/cos(normalized) raises inexact,
+ *  sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM,
+ *  sin/cos(NaN) = NaN.
+ */
+
+#ifdef	PIC	/* PIC build: local data is addressed @GOTOFF relative to %ebx.  */
+# define MO1(symbol)			L(symbol)##@GOTOFF(%ebx)	/* Operand for L(symbol).  */
+# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%ebx,reg2,_scale)	/* Same, indexed by reg2*_scale.  */
+# define CFI_PUSH(REG)	cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG)	cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG)			pushl REG; CFI_PUSH(REG)
+# define POP(REG)			popl REG; CFI_POP(REG)
+# define ENTRANCE			PUSH(%ebx); LOAD_PIC_REG(bx)	/* Save %ebx, then load the PIC base into it.  */
+# define RETURN				POP(%ebx); ret; CFI_PUSH(%ebx)	/* CFI_PUSH after ret re-describes the frame for the code that follows.  */
+# define ARG_X				8(%esp)	/* Offsets are 4 larger than non-PIC because ENTRANCE pushed %ebx.  */
+# define ARG_SIN_PTR			12(%esp)
+# define ARG_COS_PTR			16(%esp)
+#else	/* Non-PIC build: absolute addressing, no register saved.  */
+# define MO1(symbol)			L(symbol)
+# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN				ret
+# define ARG_X				4(%esp)
+# define ARG_SIN_PTR			8(%esp)
+# define ARG_COS_PTR			12(%esp)
+#endif
+
+	.text
+ENTRY(__sincosf_sse2)
+	/* Input: single precision x on stack at address ARG_X */
+	/*        pointer to sin result on stack at address ARG_SIN_PTR */
+	/*        pointer to cos result on stack at address ARG_COS_PTR */
+	/* Output: both results are stored through the two pointers.  */
+
+	ENTRANCE
+	movl	ARG_X, %eax		/* Raw bit pattern of x */
+	cvtss2sd ARG_X, %xmm0		/* DP x */
+	andl	$0x7fffffff, %eax	/* Bit pattern of |x| (sign bit cleared) */
+
+	cmpl	$0x3f490fdb, %eax	/* |x|<Pi/4 ? */
+	jb	L(arg_less_pio4)
+
+	/* Here if |x|>=Pi/4 */
+	movd	%eax, %xmm3		/* SP |x| */
+	andpd	MO1(DP_ABS_MASK),%xmm0	/* DP |x| */
+	movss	MO1(SP_INVPIO4), %xmm2	/* SP 1/(Pi/4) */
+
+	cmpl	$0x40e231d6, %eax	/* |x|<9*Pi/4 ? */
+	jae	L(large_args)
+
+	/* Here if Pi/4<=|x|<9*Pi/4 */
+	mulss	%xmm3, %xmm2		/* SP |x|/(Pi/4) */
+	movl	ARG_X, %ecx		/* Load x */
+	cvttss2si %xmm2, %eax		/* k, number of Pi/4 in x */
+	shrl	$29, %ecx		/* sign of x now in bit 2; bits 0-1 hold
+					   exponent bits that L(reconstruction)
+					   shifts out */
+	addl	$1, %eax		/* n = k+1 */
+	movl	$0x0e, %edx
+	andl	%eax, %edx		/* j = (k+1)&0x0e */
+	subsd	MO2(PIO4J,%edx,8), %xmm0/* t = |x| - PIO4J[j] = |x| - j * Pi/4;
+					   falls through to L(reconstruction) */
+
+L(reconstruction):
+	/* Input: %eax=n, %xmm0=t, %ecx=sign(x) in bit 2.
+	   Both polynomials are evaluated at once in packed form: the low
+	   DP lane carries the sin(t) polynomial, the high lane the cos(t)
+	   polynomial ("hi|lo" notation in the comments below).  */
+
+	movaps	%xmm0, %xmm4		/* t */
+	movhpd	MO1(DP_ONES), %xmm4	/* 1|t */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	movl	$2, %edx
+	unpcklpd %xmm0, %xmm0		/* y|y */
+	addl	%eax, %edx		/* n+2 */
+	movaps	%xmm0, %xmm1		/* y|y */
+	mulpd	%xmm0, %xmm0		/* z=t^4|z=t^4 */
+
+	movaps	MO1(DP_SC4), %xmm2	/* S4 */
+	mulpd	%xmm0, %xmm2		/* z*S4 */
+	movaps	MO1(DP_SC3), %xmm3	/* S3 */
+	mulpd	%xmm0, %xmm3		/* z*S3 */
+	xorl	%eax, %ecx		/* (sign_x ^ (n>>2))<<2, in bit 2 */
+	addpd	MO1(DP_SC2), %xmm2	/* S2+z*S4 */
+	mulpd	%xmm0, %xmm2		/* z*(S2+z*S4) */
+	shrl	$2, %edx		/* (n+2)>>2 */
+	addpd	MO1(DP_SC1), %xmm3	/* S1+z*S3 */
+	mulpd	%xmm0, %xmm3		/* z*(S1+z*S3) */
+	shrl	$2, %ecx		/* sign_x ^ n>>2 */
+	addpd	MO1(DP_SC0), %xmm2	/* S0+z*(S2+z*S4) */
+	andl	$1, %edx		/* sign_cos = ((n+2)>>2)&1 */
+	mulpd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
+	andl	$1, %ecx		/* sign_sin = sign_x ^ ((n>>2)&1) */
+	addpd	%xmm2, %xmm3		/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	mulpd	%xmm4, %xmm3		/*t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+	testl	$2, %eax		/* n&2 != 0 ? */
+	addpd	%xmm4, %xmm3		/*t+t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/
+	jnz	L(sin_result_sin_poly)
+
+/*L(sin_result_cos_poly):*/
+	/*
+	 * Fall-through case, n&2 == 0.  As the stores below show:
+	 * sin(x) = poly_sin * sign_sin   (low lane)
+	 * cos(x) = poly_cos * sign_cos   (high lane)
+	 * NOTE(review): the commented-out label name above suggests the
+	 * opposite pairing; the code is what is described here.
+	 */
+	movsd	MO2(DP_ONES,%ecx,8), %xmm4/* 0|sign_sin */
+	movhpd	MO2(DP_ONES,%edx,8), %xmm4/* sign_cos|sign_sin */
+	mulpd	%xmm4, %xmm3		/* result_cos|result_sin */
+	movl	ARG_SIN_PTR, %eax
+	cvtpd2ps %xmm3, %xmm0		/* SP results */
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm0, (%eax)		/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (%ecx)		/* store cos(x) */
+	RETURN
+
+	.p2align	4
+L(sin_result_sin_poly):
+	/*
+	 * Here if n&2 != 0.  As the stores below show:
+	 * cos(x) = poly_sin * sign_cos   (low lane)
+	 * sin(x) = poly_cos * sign_sin   (high lane)
+	 * NOTE(review): despite the label name, sin(x) is taken from the
+	 * cos(t) polynomial on this path.
+	 */
+	movsd	MO2(DP_ONES,%edx,8), %xmm4/* 0|sign_cos */
+	movhpd	MO2(DP_ONES,%ecx,8), %xmm4/* sign_sin|sign_cos */
+	mulpd	%xmm4, %xmm3		/* result_sin|result_cos */
+	movl	ARG_SIN_PTR, %eax
+	cvtpd2ps %xmm3, %xmm0		/* SP results */
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm0, (%ecx)		/* store cos(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move sin(x) to xmm0[0] */
+	movss	%xmm0, (%eax)		/* store sin(x) */
+	RETURN
+
+	.p2align	4
+L(large_args):
+	/* Here if |x|>=9*Pi/4.  %eax still holds the bit pattern of |x|.  */
+	cmpl	$0x7f800000, %eax	/* x is Inf or NaN ? (the flags set here are
+					   tested again at L(arg_inf_or_nan)) */
+	jae	L(arg_inf_or_nan)
+
+	/* Here if finite |x|>=9*Pi/4 */
+	cmpl	$0x4b000000, %eax	/* |x|<2^23 ? */
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	MO1(DP_INVPIO4), %xmm1	/* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax		/* n = k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4		/* DP j */
+	movl	ARG_X, %ecx		/* Load x */
+	movsd	MO1(DP_PIO4HI), %xmm2	/* -PIO4HI = high part of -Pi/4 */
+	shrl	$29, %ecx		/* sign of x now in bit 2 */
+	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
+	movsd	MO1(DP_PIO4LO), %xmm3	/* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
+	mulsd	%xmm3, %xmm4		/* -j*PIO4LO */
+	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align	4
+L(very_large_args):
+	/* Here if finite |x|>=2^23.  %eax holds the bit pattern of |x|,
+	   %xmm0 holds DP |x|.  */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax		/* eb = biased exponent of x */
+	subl	$68, %eax		/* bitpos=eb-0x7f+59, where 0x7f */
+							/*is exponent bias */
+	movl	$28, %ecx		/* %cl=28 */
+	movl	%eax, %edx		/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl			/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3		/* |x| */
+	andl	$0xff, %eax		/* clear unneeded remainder from %ah*/
+
+	imull	$28, %eax, %ecx		/* j*28 */
+	movsd	MO1(DP_HI_MASK), %xmm4	/* DP_HI_MASK */
+	movapd	%xmm0, %xmm5		/* |x| */
+	mulsd	-2*8+MO2(_FPI,%eax,8), %xmm3/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1		/* |x| */
+	mulsd	-1*8+MO2(_FPI,%eax,8), %xmm5/* tmp2 = FPI[j-1]*|x| */
+	mulsd	0*8+MO2(_FPI,%eax,8), %xmm0/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx		/* j*28+19 */
+	mulsd	1*8+MO2(_FPI,%eax,8), %xmm1/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx		/* bitpos>=j*28+19 ? */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4		/* HI(tmp3) */
+	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	MO1(DP_2POW52), %xmm6	/* +2^52 */
+	movapd	%xmm5, %xmm2		/* tmp2 copy */
+	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
+	movsd	8+MO1(DP_2POW52), %xmm4	/* -2^52 (second entry of DP_2POW52) */
+	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
+	movl	ARG_X, %ecx		/* Load x */
+	comisd	%xmm5, %xmm4		/* tmp4 > tmp5 ? */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5, i.e. adding 2^52 rounded tmp5 upward:
+	   step both the integer k and its DP image tmp4 back by one.  */
+	subl	$1, %eax		/* k-- */
+	addsd	8+MO1(DP_ONES), %xmm4	/* tmp4 -= 1.0 (adds DP_ONES[1] = -1.0) */
+L(very_large_skip2):
+
+	andl	%eax, %edx		/* k&1 */
+	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
+	addsd	MO2(DP_ZERONE,%edx,8), %xmm3/* t  = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3		/* t += tmp2 */
+	shrl	$29, %ecx		/* sign of x now in bit 2 */
+	addsd	%xmm3, %xmm0		/* t += tmp0 */
+	addl	$1, %eax		/* n=k+1 */
+	addsd	%xmm1, %xmm0		/* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= Pi/4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align	4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4.  %eax = bit pattern of |x|, %xmm0 = DP x.  */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5 ? */
+	jl	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4.  Packed evaluation: low lane computes
+	   sin(x), high lane computes cos(x).  */
+	movaps	%xmm0, %xmm3		/* DP x */
+	movhpd	MO1(DP_ONES), %xmm3	/* DP 1|x */
+	mulsd	%xmm0, %xmm0		/* DP y=x^2 */
+	unpcklpd %xmm0, %xmm0		/* DP y|y */
+	movaps	%xmm0, %xmm1		/* y|y */
+	mulpd	%xmm0, %xmm0		/* z=x^4|z=x^4 */
+
+	movapd	MO1(DP_SC4), %xmm4	/* S4 */
+	mulpd	%xmm0, %xmm4		/* z*S4 */
+	movapd	MO1(DP_SC3), %xmm5	/* S3 */
+	mulpd	%xmm0, %xmm5		/* z*S3 */
+	addpd	MO1(DP_SC2), %xmm4	/* S2+z*S4 */
+	mulpd	%xmm0, %xmm4		/* z*(S2+z*S4) */
+	addpd	MO1(DP_SC1), %xmm5	/* S1+z*S3 */
+	mulpd	%xmm0, %xmm5		/* z*(S1+z*S3) */
+	addpd	MO1(DP_SC0), %xmm4	/* S0+z*(S2+z*S4) */
+	mulpd	%xmm1, %xmm4		/* y*(S0+z*(S2+z*S4)) */
+	mulpd	%xmm3, %xmm5		/* x*z*(S1+z*S3) */
+	mulpd	%xmm3, %xmm4		/* x*y*(S0+z*(S2+z*S4)) */
+	addpd	%xmm5, %xmm4		/*x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+	movl	ARG_SIN_PTR, %eax
+	addpd	%xmm4, %xmm3		/*x+x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/
+	movl	ARG_COS_PTR, %ecx
+	cvtpd2ps %xmm3, %xmm0		/* SP results */
+	movss	%xmm0, (%eax)		/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (%ecx)		/* store cos(x) */
+	RETURN
+
+	.p2align	4
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5.  %eax = bit pattern of |x|, %xmm0 = DP x.  */
+	cmpl	$0x32000000, %eax	/* |x|<2^-27 ? */
+	jl	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5.  Packed evaluation; the comments below
+	   name the sin (low) lane, the high lane does the same with the
+	   DP_COS2_* coefficients and 1 in place of x.  */
+	movaps	%xmm0, %xmm1		/* DP x */
+	movhpd	MO1(DP_ONES), %xmm1	/* DP 1|x */
+	mulsd	%xmm0, %xmm0		/* DP x^2 */
+	unpcklpd %xmm0, %xmm0		/* DP x^2|x^2 */
+
+	movaps	MO1(DP_SINCOS2_1), %xmm3/* DP DP_SIN2_1 */
+	mulpd	%xmm0, %xmm3		/* DP x^2*DP_SIN2_1 */
+	addpd	MO1(DP_SINCOS2_0), %xmm3/* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+	mulpd	%xmm0, %xmm3		/* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+	mulpd	%xmm1, %xmm3		/* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	addpd	%xmm1, %xmm3		/* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	movl	ARG_SIN_PTR, %eax
+	cvtpd2ps %xmm3, %xmm0		/* SP results */
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm0, (%eax)		/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (%ecx)		/* store cos(x) */
+	RETURN
+
+	.p2align	4
+L(arg_less_2pn27):
+	movss	ARG_X, %xmm7		/* SP x */
+	cmpl	$0, %eax		/* x=0 ? (%eax = bit pattern of |x|) */
+	je	L(arg_zero)		/* in case x=0 return sin(+-0)==+-0 */
+	/* Here if 0<|x|<2^-27 */
+	/*
+	 * Special cases here:
+	 *  sin(subnormal) raises inexact/underflow
+	 *  sin(min_normalized) raises inexact/underflow
+	 *  sin(normalized) raises inexact
+	 *  cos(here)=1-|x| (raising inexact)
+	 */
+	movaps	%xmm0, %xmm3		/* DP x */
+	mulsd	MO1(DP_SMALL), %xmm0	/* DP x*DP_SMALL */
+	subsd	%xmm0, %xmm3		/* DP sin result is x-x*DP_SMALL */
+	andps	MO1(SP_ABS_MASK), %xmm7	/* SP |x| */
+	cvtsd2ss %xmm3, %xmm0		/* sin(x) */
+	movl	ARG_SIN_PTR, %eax
+	movss	MO1(SP_ONE), %xmm1	/* SP 1.0 */
+	movss	%xmm0, (%eax)		/* sin(x) store */
+	movl	ARG_COS_PTR, %ecx
+	subss	%xmm7, %xmm1		/* cos(x) = 1.0-|x| */
+	movss	%xmm1, (%ecx)		/* cos(x) store */
+	RETURN
+
+	.p2align	4
+L(arg_zero):
+	movss	MO1(SP_ONE), %xmm0	/* 1.0 */
+	movl	ARG_SIN_PTR, %eax
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm7, (%eax)		/* sin(+-0)==x; %xmm7 was loaded with x
+					   at L(arg_less_2pn27) */
+	movss	%xmm0, (%ecx)		/* cos(+-0)==1 */
+	RETURN
+
+	.p2align	4
+L(arg_inf_or_nan):
+	/* Here if |x| is Inf or NAN.  The flags are still those of
+	   "cmpl $0x7f800000, %eax" in L(large_args): equal means Inf,
+	   not-equal (above) means NaN.  */
+	jne	L(skip_errno_setting)	/* in case of x is NaN */
+
+	/* Here if x is Inf. Set errno to EDOM.  */
+	call	JUMPTARGET(__errno_location)
+	movl	$EDOM, (%eax)
+
+	.p2align	4
+L(skip_errno_setting):
+	/* Here if |x| is Inf or NAN. Continued.
+	   x is loaded only now, after the call above: XMM registers are
+	   caller-saved, so a value placed in %xmm7 before the call could
+	   not be relied upon afterwards.  */
+	movss	ARG_X, %xmm7		/* SP x */
+	subss	%xmm7, %xmm7		/* x-x, result is NaN */
+	movl	ARG_SIN_PTR, %eax
+	movl	ARG_COS_PTR, %ecx
+	movss	%xmm7, (%eax)		/* sin(x) = NaN */
+	movss	%xmm7, (%ecx)		/* cos(x) = NaN */
+	RETURN
+END(__sincosf_sse2)
+
+	.section .rodata, "a"
+	.p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+	.long	0x00000000,0x00000000	/*  0*Pi/4 */
+	.long	0x54442d18,0x3fe921fb	/*  1*Pi/4 */
+	.long	0x54442d18,0x3ff921fb	/*  2*Pi/4 */
+	.long	0x7f3321d2,0x4002d97c	/*  3*Pi/4 */
+	.long	0x54442d18,0x400921fb	/*  4*Pi/4 */
+	.long	0x2955385e,0x400f6a7a	/*  5*Pi/4 */
+	.long	0x7f3321d2,0x4012d97c	/*  6*Pi/4 */
+	.long	0xe9bba775,0x4015fdbb	/*  7*Pi/4 */
+	.long	0x54442d18,0x401921fb	/*  8*Pi/4 */
+	.long	0xbeccb2bb,0x401c463a	/*  9*Pi/4 */
+	.long	0x2955385e,0x401f6a7a	/* 10*Pi/4 */
+	.type L(PIO4J), @object
+	ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+	.p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+	.long	0x00000000,0x00000000
+	.long	0x6c000000,0x3ff45f30
+	.long	0x2a000000,0x3e3c9c88
+	.long	0xa8000000,0x3c54fe13
+	.long	0xd0000000,0x3aaf47d4
+	.long	0x6c000000,0x38fbb81b
+	.long	0xe0000000,0x3714acc9
+	.long	0x7c000000,0x3560e410
+	.long	0x56000000,0x33bca2c7
+	.long	0xac000000,0x31fbd778
+	.long	0xe0000000,0x300b7246
+	.long	0xe8000000,0x2e5d2126
+	.long	0x48000000,0x2c970032
+	.long	0xe8000000,0x2ad77504
+	.long	0xe0000000,0x290921cf
+	.long	0xb0000000,0x274deb1c
+	.long	0xe0000000,0x25829a73
+	.long	0xbe000000,0x23fd1046
+	.long	0x10000000,0x2224baed
+	.long	0x8e000000,0x20709d33
+	.long	0x80000000,0x1e535a2f
+	.long	0x64000000,0x1cef904e
+	.long	0x30000000,0x1b0d6398
+	.long	0x24000000,0x1964ce7d
+	.long	0x16000000,0x17b908bf
+	.type L(_FPI), @object
+	ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomials for */
+/* sin(x)~=x+x*x^2*(DP_SIN2_0+x^2*DP_SIN2_1) in low  DP part, */
+/* cos(x)~=1+1*x^2*(DP_COS2_0+x^2*DP_COS2_1) in high DP part, */
+/* for |x|<2^-5. */
+	.p2align 4
+L(DP_SINCOS2_0):
+	.long	0x5543d49d,0xbfc55555
+	.long	0xff5cc6fd,0xbfdfffff
+	.type L(DP_SINCOS2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_0))
+
+	.p2align 4
+L(DP_SINCOS2_1):
+	.long	0x75cec8c5,0x3f8110f4
+	.long	0xb178dac5,0x3fa55514
+	.type L(DP_SINCOS2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_1))
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomials for */
+/* sin(t)~=t+t*t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))) in low  DP part, */
+/* cos(t)~=1+1*t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))) in high DP part, */
+/* for |t|<Pi/4. */
+	.p2align 4
+L(DP_SC4):
+	.long	0x1674b58a,0xbe5a947e
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_SC4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC4))
+
+	.p2align 4
+L(DP_SC3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_SC3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC3))
+
+	.p2align 4
+L(DP_SC2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_SC2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC2))
+
+	.p2align 4
+L(DP_SC1):
+	.long	0x10c2688b,0x3f811111
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_SC1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC1))
+
+	.p2align 4
+L(DP_SC0):
+	.long	0x55551cd9,0xbfc55555
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_SC0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC0))
+
+	.p2align 3
+L(DP_SMALL):
+	.long	0x00000000,0x3cd00000	/* 2^(-50) */
+	.type L(DP_SMALL), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* -(high part of Pi/4); sign bit is set */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* -(low part of Pi/4); sign bit is set */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983		/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask keeping the upper 32 bits of a DP value (clears the low 32 mantissa bits) */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+	.p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+	.long	0x7fffffff,0x7fffffff
+	.long	0x7fffffff,0x7fffffff
+	.type L(SP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+	.p2align 2
+L(SP_ONE):
+	.long	0x3f800000		/* 1.0 */
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias(__sincosf, sincosf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
new file mode 100644
index 0000000000..9428f9b4ea
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
@@ -0,0 +1,30 @@
+/* Multiple versions of sincosf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern void __sincosf_sse2 (float, float *, float *);
+extern void __sincosf_ia32 (float, float *, float *);
+void __sincosf (float, float *, float *);
+/* Selector: the SSE2 variant when HAS_CPU_FEATURE (SSE2) holds, otherwise the ia32 one.  */
+libm_ifunc (__sincosf,
+	    HAS_CPU_FEATURE (SSE2) ? __sincosf_sse2 : __sincosf_ia32);
+weak_alias (__sincosf, sincosf);
+/* NOTE(review): presumably the generic source below defines its function under the name SINCOSF -- confirm there.  */
+#define SINCOSF __sincosf_ia32
+#include <sysdeps/ieee754/flt-32/s_sincosf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S
new file mode 100644
index 0000000000..ee96018061
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S
@@ -0,0 +1,566 @@
+/* Optimized with sse2 version of sinf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x| == 0: return x.
+ *  2) if |x| <  2^-27: return x-x*DP_SMALL, raise underflow only when needed.
+ *  3) if |x| <  2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
+ *  4) if |x| <   Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          s = sign(x) * (-1.0)^((n>>2)&1)
+ *          if(n&2 != 0) {
+ *              using cos(t) polynomial for |t|<Pi/4, result is
+ *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ *          } else {
+ *              using sin(t) polynomial for |t|<Pi/4, result is
+ *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ *           t=|x|-j*Pi/4.
+ *      7.2) Reconstruction same as (5.2).
+ *  8) if x is Inf, return x-x, and set errno=EDOM.
+ *  9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ *  sin(+-0) = +-0 not raising inexact/underflow,
+ *  sin(subnormal) raises inexact/underflow,
+ *  sin(min_normalized) raises inexact/underflow,
+ *  sin(normalized) raises inexact,
+ *  sin(Inf) = NaN, raises invalid, sets errno to EDOM,
+ *  sin(NaN) = NaN.
+ */
+
+#ifdef	PIC	/* Data is addressed relative to %ebx via @GOTOFF.  */
+# define MO1(symbol)			L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale)	L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG)	cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG)	cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG)			pushl REG; CFI_PUSH(REG)
+# define POP(REG)			popl REG; CFI_POP(REG)
+# define ENTRANCE			PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN				POP(%ebx); ret; CFI_PUSH(%ebx)	/* CFI_PUSH undoes the CFI_POP bookkeeping for the code placed after ret */
+# define ARG_X				8(%esp)	/* x lies past the saved %ebx and the return address */
+#else	/* !PIC: data is addressed absolutely, %ebx is not saved.  */
+# define MO1(symbol)			L(symbol)
+# define MO2(symbol,reg2,_scale)	L(symbol)(,reg2,_scale)
+# define ENTRANCE			/* nothing to set up */
+# define RETURN				ret
+# define ARG_X				4(%esp)	/* x lies just past the return address */
+#endif	/* PIC */
+
+	.text
+ENTRY(__sinf_sse2)
+	/* Input: single precision x on stack at address ARG_X; every exit path loads the result into %st(0) */
+
+	ENTRANCE
+	movl	ARG_X, %eax		/* Bits of x */
+	cvtss2sd ARG_X, %xmm0		/* DP x */
+	andl	$0x7fffffff, %eax	/* bits of |x| (sign bit cleared) */
+
+	cmpl	$0x3f490fdb, %eax	/* |x|<Pi/4?  */
+	jb	L(arg_less_pio4)	/* yes: no range reduction needed */
+
+	/* Here if |x|>=Pi/4 */
+	movd	%eax, %xmm3		/* SP |x| */
+	andpd	MO1(DP_ABS_MASK),%xmm0	/* DP |x| */
+	movss	MO1(SP_INVPIO4), %xmm2	/* SP 1/(Pi/4) */
+
+	cmpl	$0x40e231d6, %eax	/* |x|<9*Pi/4?  */
+	jae	L(large_args)		/* no: large, Inf or NaN */
+
+	/* Here if Pi/4<=|x|<9*Pi/4 */
+	mulss	%xmm3, %xmm2		/* SP |x|/(Pi/4) */
+	movl	ARG_X, %ecx		/* Load x */
+	cvttss2si %xmm2, %eax		/* k, number of Pi/4 in x */
+	shrl	$31, %ecx		/* sign of x */
+	addl	$1, %eax		/* n = k+1 */
+	movl	$0x0e, %edx		/* mask for the even part of k+1 */
+	andl	%eax, %edx		/* j = (k+1)&0x0e */
+	subsd	MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4; falls through to L(reconstruction) */
+
+L(reconstruction):
+	/* Input: %eax=n, %xmm0=t, %ecx=sign(x) */
+	testl	$2, %eax		/* n&2 != 0?  */
+	jz	L(sin_poly)
+
+/*L(cos_poly):*/
+	/* Here if sin(x) calculated using cos(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = sign(x) * (-1.0)^((n>>2)&1)
+	 * result = s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
+	 */
+	shrl	$2, %eax		/* n>>2 */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	andl	$1, %eax		/* (n>>2)&1 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=t^4 */
+
+	movsd	MO1(DP_C4), %xmm4	/* C4 */
+	mulsd	%xmm0, %xmm4		/* z*C4 */
+	xorl	%eax, %ecx		/* ((n>>2)&1) XOR sign(x): index of s in DP_ONES */
+	movsd	MO1(DP_C3), %xmm3	/* C3 */
+	mulsd	%xmm0, %xmm3		/* z*C3 */
+	addsd	MO1(DP_C2), %xmm4	/* C2+z*C4 */
+	mulsd	%xmm0, %xmm4		/* z*(C2+z*C4) */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+	addsd	MO1(DP_C1), %xmm3	/* C1+z*C3 */
+	mulsd	%xmm0, %xmm3		/* z*(C1+z*C3) */
+	addsd	MO1(DP_C0), %xmm4	/* C0+z*(C2+z*C4) */
+	mulsd	%xmm1, %xmm4		/* y*(C0+z*(C2+z*C4)) */
+
+	addsd	%xmm4, %xmm3		/* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	/* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+	addsd	MO1(DP_ONES), %xmm3
+
+	mulsd	MO2(DP_ONES,%ecx,8), %xmm3 /* DP result = s * polynomial */
+	movsd	%xmm3, 0(%esp)		/* Move result from sse...  */
+	fldl	0(%esp)			/* ...to FPU.  */
+	/* Give back the 8 bytes of stack frame */
+	lea	8(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(sin_poly):
+	/* Here if sin(x) calculated using sin(t) polynomial for |t|<Pi/4:
+	 * y = t*t; z = y*y;
+	 * s = sign(x) * (-1.0)^((n>>2)&1)
+	 * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
+	 */
+
+	movaps	%xmm0, %xmm4		/* t */
+	shrl	$2, %eax		/* n>>2 */
+	mulsd	%xmm0, %xmm0		/* y=t^2 */
+	andl	$1, %eax		/* (n>>2)&1 */
+	movaps	%xmm0, %xmm1		/* y */
+	xorl	%eax, %ecx		/* ((n>>2)&1) XOR sign(x): index of s in DP_ONES */
+	mulsd	%xmm0, %xmm0		/* z=t^4 */
+
+	movsd	MO1(DP_S4), %xmm2	/* S4 */
+	mulsd	%xmm0, %xmm2		/* z*S4 */
+	movsd	MO1(DP_S3), %xmm3	/* S3 */
+	mulsd	%xmm0, %xmm3		/* z*S3 */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+	addsd	MO1(DP_S2), %xmm2	/* S2+z*S4 */
+	mulsd	%xmm0, %xmm2		/* z*(S2+z*S4) */
+	addsd	MO1(DP_S1), %xmm3	/* S1+z*S3 */
+	mulsd	%xmm0, %xmm3		/* z*(S1+z*S3) */
+	addsd	MO1(DP_S0), %xmm2	/* S0+z*(S2+z*S4) */
+	mulsd	%xmm1, %xmm2		/* y*(S0+z*(S2+z*S4)) */
+	/* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */
+	mulsd	MO2(DP_ONES,%ecx,8), %xmm4
+	addsd	%xmm2, %xmm3		/* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	/* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	mulsd	%xmm4, %xmm3
+	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
+	addsd	%xmm4, %xmm3
+	movsd	%xmm3, 0(%esp)		/* Move result from sse...  */
+	fldl	0(%esp)			/* ...to FPU.  */
+	/* Give back the 8 bytes of stack frame */
+	lea	8(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(large_args):
+	/* Here if |x|>=9*Pi/4 */
+	cmpl	$0x7f800000, %eax	/* x is Inf or NaN?  */
+	jae	L(arg_inf_or_nan)	/* the target reuses these flags to tell Inf from NaN */
+
+	/* Here if finite |x|>=9*Pi/4 */
+	cmpl	$0x4b000000, %eax	/* |x|<2^23?  */
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	MO1(DP_INVPIO4), %xmm1	/* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax		/* n = k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4		/* DP j */
+	movl	ARG_X, %ecx		/* Load x */
+	movsd	MO1(DP_PIO4HI), %xmm2	/* -PIO4HI = high part of -Pi/4 */
+	shrl	$31, %ecx		/* sign bit of x */
+	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
+	movsd	MO1(DP_PIO4LO), %xmm3	/* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
+	mulsd	%xmm3, %xmm4		/* -j*PIO4LO */
+	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align	4
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax		/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+	subl	$68, %eax		/* 68 = 0x7f - 59 */
+	movl	$28, %ecx		/* %cl=28 */
+	movl	%eax, %edx		/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl			/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3		/* |x| */
+	/* clear unneeded remainder from %ah */
+	andl	$0xff, %eax
+
+	imull	$28, %eax, %ecx		/* j*28 */
+	movsd	MO1(DP_HI_MASK), %xmm4	/* DP_HI_MASK */
+	movapd	%xmm0, %xmm5		/* |x| */
+	mulsd	-2*8+MO2(_FPI,%eax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1		/* |x| */
+	mulsd	-1*8+MO2(_FPI,%eax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
+	mulsd	0*8+MO2(_FPI,%eax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx		/* j*28+19 */
+	mulsd	1*8+MO2(_FPI,%eax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx		/* bitpos>=j*28+19?   */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4		/* HI(tmp3) */
+	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	MO1(DP_2POW52), %xmm6	/* +2^52 */
+	movapd	%xmm5, %xmm2		/* tmp2 copy */
+	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx		/* mask for k&1 below */
+	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
+	movsd	8+MO1(DP_2POW52), %xmm4	/* -2^52 */
+	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
+	movl	ARG_X, %ecx		/* Load x */
+	comisd	%xmm5, %xmm4		/* tmp4 > tmp5?  */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax		/* k-- */
+	addsd	8+MO1(DP_ONES), %xmm4	/* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx		/* k&1 */
+	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
+	addsd	MO2(DP_ZERONE,%edx,8), %xmm3 /* t  = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3		/* t += tmp2 */
+	shrl	$31, %ecx		/* sign of x */
+	addsd	%xmm3, %xmm0		/* t += tmp0 */
+	addl	$1, %eax		/* n=k+1 */
+	addsd	%xmm1, %xmm0		/* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= PIO4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align	4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4; %eax = bits of |x|, %xmm0 = DP x with its sign */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5?  */
+	jl	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4 */
+	movaps	%xmm0, %xmm3		/* x */
+	mulsd	%xmm0, %xmm0		/* y=x^2 */
+	movaps	%xmm0, %xmm1		/* y */
+	mulsd	%xmm0, %xmm0		/* z=x^4 */
+	movsd	MO1(DP_S4), %xmm4	/* S4 */
+	mulsd	%xmm0, %xmm4		/* z*S4 */
+	movsd	MO1(DP_S3), %xmm5	/* S3 */
+	mulsd	%xmm0, %xmm5		/* z*S3 */
+	addsd	MO1(DP_S2), %xmm4	/* S2+z*S4 */
+	mulsd	%xmm0, %xmm4		/* z*(S2+z*S4) */
+	addsd	MO1(DP_S1), %xmm5	/* S1+z*S3 */
+	mulsd	%xmm0, %xmm5		/* z*(S1+z*S3) */
+	addsd	MO1(DP_S0), %xmm4	/* S0+z*(S2+z*S4) */
+	mulsd	%xmm1, %xmm4		/* y*(S0+z*(S2+z*S4)) */
+	mulsd	%xmm3, %xmm5		/* x*z*(S1+z*S3) */
+	mulsd	%xmm3, %xmm4		/* x*y*(S0+z*(S2+z*S4)) */
+	/* x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	addsd	%xmm5, %xmm4
+	/* x + x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+	addsd	%xmm4, %xmm3
+	cvtsd2ss %xmm3, %xmm3		/* SP result; falls through to L(epilogue) */
+
+L(epilogue):	/* Input: %xmm3 = SP result; it is returned in %st(0) */
+	lea	-4(%esp), %esp		/* Borrow 4 bytes of stack frame */
+	movss	%xmm3, 0(%esp)		/* Move result from sse...  */
+	flds	0(%esp)			/* ...to FPU.  */
+	/* Give back the 4 bytes of stack frame */
+	lea	4(%esp), %esp
+	RETURN
+
+	.p2align	4
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5; %eax = bits of |x|, %xmm0 = DP x */
+	cmpl	$0x32000000, %eax	/* |x|<2^-27?  */
+	jl	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5 */
+	movaps	%xmm0, %xmm1		/* DP x */
+	mulsd	%xmm0, %xmm0		/* DP x^2 */
+	movsd	MO1(DP_SIN2_1), %xmm3	/* DP DP_SIN2_1 */
+	mulsd	%xmm0, %xmm3		/* DP x^2*DP_SIN2_1 */
+	addsd	MO1(DP_SIN2_0), %xmm3	/* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+	mulsd	%xmm0, %xmm3		/* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+	mulsd	%xmm1, %xmm3		/* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	addsd	%xmm1, %xmm3		/* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	cvtsd2ss %xmm3, %xmm3		/* SP result */
+	jmp	L(epilogue)		/* hand %xmm3 to the FPU and return */
+
+	.p2align	4
+L(arg_less_2pn27):
+	movss	ARG_X, %xmm3		/* SP x, returned unchanged when x is +-0 */
+	cmpl	$0, %eax		/* x=0?  */
+	je	L(epilogue)		/* in case x=0 return sin(+-0)==+-0 */
+	/* Here if 0<|x|<2^-27 */
+	/*
+	 * Special cases here:
+	 *  sin(subnormal) raises inexact/underflow
+	 *  sin(min_normalized) raises inexact/underflow
+	 *  sin(normalized) raises inexact
+	 */
+	movaps	%xmm0, %xmm3		/* Copy of DP x */
+	mulsd	MO1(DP_SMALL), %xmm0	/* x*DP_SMALL */
+	subsd	%xmm0, %xmm3		/* Result is x-x*DP_SMALL */
+	cvtsd2ss %xmm3, %xmm3		/* Result converted to SP */
+	jmp	L(epilogue)
+
+	.p2align	4
+L(arg_inf_or_nan):
+	/* Here if |x| is Inf or NaN; flags still come from the cmpl against 0x7f800000 in L(large_args) */
+	jne	L(skip_errno_setting)	/* not equal to the Inf pattern: x is NaN */
+
+	/* Here if x is Inf. Set errno to EDOM.  */
+	call	JUMPTARGET(__errno_location)
+	movl	$EDOM, (%eax)		/* store EDOM through the returned pointer */
+
+	.p2align	4
+L(skip_errno_setting):
+	/* Here if |x| is Inf or NaN. Continued.  */
+	movss	ARG_X, %xmm3		/* load x from the stack (after any call above) */
+	subss	%xmm3, %xmm3		/* x-x, result is NaN */
+	jmp	L(epilogue)
+END(__sinf_sse2)
+
+	.section .rodata, "a"
+	.p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+	.long	0x00000000,0x00000000	/*  0*Pi/4 */
+	.long	0x54442d18,0x3fe921fb	/*  1*Pi/4 */
+	.long	0x54442d18,0x3ff921fb	/*  2*Pi/4 */
+	.long	0x7f3321d2,0x4002d97c	/*  3*Pi/4 */
+	.long	0x54442d18,0x400921fb	/*  4*Pi/4 */
+	.long	0x2955385e,0x400f6a7a	/*  5*Pi/4 */
+	.long	0x7f3321d2,0x4012d97c	/*  6*Pi/4 */
+	.long	0xe9bba775,0x4015fdbb	/*  7*Pi/4 */
+	.long	0x54442d18,0x401921fb	/*  8*Pi/4 */
+	.long	0xbeccb2bb,0x401c463a	/*  9*Pi/4 */
+	.long	0x2955385e,0x401f6a7a	/* 10*Pi/4 */
+	.type L(PIO4J), @object
+	ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+	.p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+	.long	0x00000000,0x00000000
+	.long	0x6c000000,0x3ff45f30
+	.long	0x2a000000,0x3e3c9c88
+	.long	0xa8000000,0x3c54fe13
+	.long	0xd0000000,0x3aaf47d4
+	.long	0x6c000000,0x38fbb81b
+	.long	0xe0000000,0x3714acc9
+	.long	0x7c000000,0x3560e410
+	.long	0x56000000,0x33bca2c7
+	.long	0xac000000,0x31fbd778
+	.long	0xe0000000,0x300b7246
+	.long	0xe8000000,0x2e5d2126
+	.long	0x48000000,0x2c970032
+	.long	0xe8000000,0x2ad77504
+	.long	0xe0000000,0x290921cf
+	.long	0xb0000000,0x274deb1c
+	.long	0xe0000000,0x25829a73
+	.long	0xbe000000,0x23fd1046
+	.long	0x10000000,0x2224baed
+	.long	0x8e000000,0x20709d33
+	.long	0x80000000,0x1e535a2f
+	.long	0x64000000,0x1cef904e
+	.long	0x30000000,0x1b0d6398
+	.long	0x24000000,0x1964ce7d
+	.long	0x16000000,0x17b908bf
+	.type L(_FPI), @object
+	ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+   for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5.  */
+	.p2align 3
+L(DP_SIN2_0):
+	.long	0x5543d49d,0xbfc55555
+	.type L(DP_SIN2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SIN2_0))
+
+	.p2align 3
+L(DP_SIN2_1):
+	.long	0x75cec8c5,0x3f8110f4
+	.type L(DP_SIN2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SIN2_1))
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+   for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_S3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.type L(DP_S3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+	.p2align 3
+L(DP_S1):
+	.long	0x10c2688b,0x3f811111
+	.type L(DP_S1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+	.p2align 3
+L(DP_S4):
+	.long	0x1674b58a,0xbe5a947e
+	.type L(DP_S4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+	.p2align 3
+L(DP_S2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.type L(DP_S2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+	.p2align 3
+L(DP_S0):
+	.long	0x55551cd9,0xbfc55555
+	.type L(DP_S0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+	.p2align 3
+L(DP_SMALL):
+	.long	0x00000000,0x3cd00000	/* 2^(-50) */
+	.type L(DP_SMALL), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+/* Coefficients of polynomial
+   for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.  */
+	.p2align 3
+L(DP_C3):
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_C3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+	.p2align 3
+L(DP_C1):
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_C1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+	.p2align 3
+L(DP_C4):
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_C4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+	.p2align 3
+L(DP_C2):
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_C2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+	.p2align 3
+L(DP_C0):
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_C0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* -(high part of Pi/4); sign bit is set */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* -(low part of Pi/4); sign bit is set */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983		/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask keeping the upper 32 bits of a DP value (clears the low 32 mantissa bits) */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+weak_alias (__sinf, sinf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
new file mode 100644
index 0000000000..8ccdd2f34d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
@@ -0,0 +1,28 @@
+/* Multiple versions of sinf
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern float __sinf_sse2 (float);
+extern float __sinf_ia32 (float);
+float __sinf (float);
+/* Selector: the SSE2 variant when HAS_CPU_FEATURE (SSE2) holds, otherwise the ia32 one.  */
+libm_ifunc (__sinf, HAS_CPU_FEATURE (SSE2) ? __sinf_sse2 : __sinf_ia32);
+weak_alias (__sinf, sinf);
+#define SINF __sinf_ia32 /* NOTE(review): presumably the name the generic source below gives its function -- confirm there */
+#include <sysdeps/ieee754/flt-32/s_sinf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S
new file mode 100644
index 0000000000..ace8db9410
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S
@@ -0,0 +1,39 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmax)
+	fldl	4(%esp)		// x
+	fldl	12(%esp)	// x : y
+
+	fucomi	%st(0), %st	// compare y with itself: unordered iff y is NaN
+	fcmovu	%st(1), %st	// now %st contains y if not NaN, x otherwise
+
+	fxch			// %st = x, %st(1) = y (or x when y was NaN)
+
+	fucomi	%st(1), %st	// compare x against %st(1)
+	fcmovb	%st(1), %st	// take %st(1) when x is below it or the compare is unordered
+
+	fstp	%st(1)		// copy the result over %st(1) and pop: one value left in %st
+
+	ret
+END(__fmax)
+weak_alias (__fmax, fmax)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S
new file mode 100644
index 0000000000..3a25951a09
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S
@@ -0,0 +1,39 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmaxf)
+	flds	4(%esp)		// x
+	flds	8(%esp)		// x : y
+
+	fucomi	%st(0), %st	// compare y with itself: unordered iff y is NaN
+	fcmovu	%st(1), %st	// now %st contains y if not NaN, x otherwise
+
+	fxch			// %st = x, %st(1) = y (or x when y was NaN)
+
+	fucomi	%st(1), %st	// compare x against %st(1)
+	fcmovb	%st(1), %st	// take %st(1) when x is below it or the compare is unordered
+
+	fstp	%st(1)		// copy the result over %st(1) and pop: one value left in %st
+
+	ret
+END(__fmaxf)
+weak_alias (__fmaxf, fmaxf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S
new file mode 100644
index 0000000000..3f6c21c63d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S
@@ -0,0 +1,58 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmaxl)
+	fldt	4(%esp)		// x
+	fldt	16(%esp)	// x : y
+
+	fucomi	%st(1), %st	// compare y (%st) with x (%st(1))
+	jp	2f		// unordered: at least one operand is a NaN
+	fcmovb	%st(1), %st	// y below x: take x
+
+	fstp	%st(1)		// copy the result over %st(1) and pop
+
+	ret
+
+2:	// Unordered.
+	fucomi	%st(0), %st	// is y itself a NaN?
+	jp	3f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 11(%esp)	// bit 62 of the significand of x; clear is treated as signaling
+	jz	4f
+	fstp	%st(1)		// x is a quiet NaN: return y
+	ret
+
+3:	// st(0) is a NaN; st(1) may or may not be.
+	fxch			// %st = x, %st(1) = y
+	fucomi	%st(0), %st	// is x a NaN as well?
+	jp	4f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 23(%esp)	// bit 62 of the significand of y; clear is treated as signaling
+	jz	4f
+	fstp	%st(1)		// y is a quiet NaN: return x
+	ret
+
+4:	// Both arguments are NaNs, or one is a signaling NaN.
+	faddp			// return the sum of the two operands
+	ret
+END(__fmaxl)
+weak_alias (__fmaxl, fmaxl)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S
new file mode 100644
index 0000000000..72d306fd79
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S
@@ -0,0 +1,37 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmin)
+	fldl	4(%esp)		// x
+	fldl	12(%esp)	// x : y
+
+	fucomi	%st(0), %st	// compare y with itself: unordered iff y is NaN
+	fcmovu	%st(1), %st	// now %st contains y if not NaN, x otherwise
+
+	fucomi	%st(1), %st	// compare %st against x in %st(1)
+	fcmovnb	%st(1), %st	// take x unless %st is below it or the compare is unordered
+
+	fstp	%st(1)		// copy the result over %st(1) and pop: one value left in %st
+
+	ret
+END(__fmin)
+weak_alias (__fmin, fmin)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S
new file mode 100644
index 0000000000..52ea892bad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S
@@ -0,0 +1,37 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fminf)
+	flds	4(%esp)		// x
+	flds	8(%esp)		// x : y
+
+	fucomi	%st(0), %st	// y against itself: unordered only if y is a NaN
+	fcmovu	%st(1), %st	// now %st contains y if not NaN, x otherwise
+
+	fucomi	%st(1), %st	// compare %st with x in %st(1)
+	fcmovnb	%st(1), %st	// %st not below x: take x as the minimum
+
+	fstp	%st(1)		// store the result over %st(1) and pop
+
+	ret
+END(__fminf)
+weak_alias (__fminf, fminf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S
new file mode 100644
index 0000000000..e1cb83fed7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S
@@ -0,0 +1,58 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fminl)
+	fldt	4(%esp)		// x
+	fldt	16(%esp)	// x : y
+
+	fucomi	%st(1), %st	// compare y in %st with x in %st(1)
+	jp	2f		// parity set: unordered, a NaN is involved
+	fcmovnb	%st(1), %st	// y not below x: take x as the minimum
+
+	fstp	%st(1)		// store the result over %st(1) and pop
+
+	ret
+
+2:	// Unordered.
+	fucomi	%st(0), %st	// y against itself: unordered only if y is a NaN
+	jp	3f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 11(%esp)	// byte 7 of x (x is stored at 4(%esp))
+	jz	4f		// bit clear: treated as signaling, go to 4
+	fstp	%st(1)		// x is a quiet NaN: return y
+	ret
+
+3:	// st(0) is a NaN; st(1) may or may not be.
+	fxch			// swap: now %st = x, %st(1) = y
+	fucomi	%st(0), %st	// x against itself: unordered only if x is a NaN
+	jp	4f
+	// st(1) is a NaN; st(0) is not.  Test if st(1) is signaling.
+	testb	$0x40, 23(%esp)	// byte 7 of y (y is stored at 16(%esp))
+	jz	4f		// bit clear: treated as signaling, go to 4
+	fstp	%st(1)		// y is a quiet NaN: return x
+	ret
+
+4:	// Both arguments are NaNs, or one is a signaling NaN.
+	faddp			// add the two operands and pop; the sum is returned
+	ret
+END(__fminl)
+weak_alias (__fminl, fminl)
diff --git a/REORG.TODO/sysdeps/i386/i686/hp-timing.h b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
new file mode 100644
index 0000000000..1b11410feb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
@@ -0,0 +1,42 @@
+/* High precision, low overhead timing functions.  i686 version.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _HP_TIMING_H
+#define _HP_TIMING_H	1
+
+/* We always assume having the timestamp register.  */
+#define HP_TIMING_AVAIL		(1)
+#define HP_SMALL_TIMING_AVAIL	(1)
+
+/* We indeed have inlined functions.  */
+#define HP_TIMING_INLINE	(1)
+
+/* We use 64bit values for the times.  */
+typedef unsigned long long int hp_timing_t;
+
+/* That's quite simple.  Use the `rdtsc' instruction.  Note that the value
+   might not be 100% accurate since there might be some more instructions
+   running in this moment.  This could be changed by using a barrier like
+   'cpuid' right before the `rdtsc' instruction.  But we are not interested
+   in accurate clock cycles here so we don't do this.  */
+#define HP_TIMING_NOW(Var)	__asm__ __volatile__ ("rdtsc" : "=A" (Var))
+
+#include <hp-timing-common.h>
+
+#endif	/* hp-timing.h */
diff --git a/REORG.TODO/sysdeps/i386/i686/init-arch.h b/REORG.TODO/sysdeps/i386/i686/init-arch.h
new file mode 100644
index 0000000000..f55f80efa0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MINIMUM_ISA 686
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/memcmp.S b/REORG.TODO/sysdeps/i386/i686/memcmp.S
new file mode 100644
index 0000000000..5140ee2145
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcmp.S
@@ -0,0 +1,408 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS		4+4	/* Preserve EBX.  */
+#define BLK1		PARMS
+#define BLK2		BLK1+4
+#define LEN		BLK2+4
+#define ENTRANCE	pushl %ebx; cfi_adjust_cfa_offset (4); \
+			cfi_rel_offset (ebx, 0)
+#define RETURN		popl %ebx; cfi_adjust_cfa_offset (-4); \
+			cfi_restore (ebx); ret
+
+/* Load an entry in a jump table into EBX.  TABLE is a jump table
+   with relative offsets.  INDEX is a register that contains the index
+   into the jump table.  */
+#define LOAD_JUMP_TABLE_ENTRY(TABLE, INDEX) \
+  /* We first load PC into EBX.  */					      \
+  SETUP_PIC_REG(bx);							      \
+  /* Get the address of the jump table.  */				      \
+  addl	$(TABLE - .), %ebx;						      \
+  /* Get the entry and convert the relative offset to the		      \
+     absolute address.  */						      \
+  addl	(%ebx,INDEX,4), %ebx
+
+        .text
+	ALIGN (4)
+ENTRY (memcmp)
+	ENTRANCE
+
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+
+	cmpl 	$1, %ecx
+	jne	L(not_1)
+	movzbl	(%eax), %ecx		/* LEN == 1  */
+	cmpb	(%edx), %cl
+	jne	L(neq)
+L(bye):
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+L(neq):
+	sbbl	%eax, %eax
+	sbbl	$-1, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+L(not_1):
+	jl	L(bye)			/* LEN == 0  */
+
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	movl	%eax, %esi
+	cfi_rel_offset (esi, 0)
+	cmpl	$32, %ecx;
+	jge	L(32bytesormore)	/* LEN >= 32  */
+
+	LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+	addl	%ecx, %edx
+	addl	%ecx, %esi
+	jmp	*%ebx
+
+	ALIGN (4)
+L(28bytes):
+	movl	-28(%esi), %eax
+	movl	-28(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(24bytes):
+	movl	-24(%esi), %eax
+	movl	-24(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(20bytes):
+	movl	-20(%esi), %eax
+	movl	-20(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(16bytes):
+	movl	-16(%esi), %eax
+	movl	-16(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(12bytes):
+	movl	-12(%esi), %eax
+	movl	-12(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(8bytes):
+	movl	-8(%esi), %eax
+	movl	-8(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(4bytes):
+	movl	-4(%esi), %eax
+	movl	-4(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(0bytes):
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (esi, 0)
+	cfi_rel_offset (ebx, 4)
+L(29bytes):
+	movl	-29(%esi), %eax
+	movl	-29(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(25bytes):
+	movl	-25(%esi), %eax
+	movl	-25(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(21bytes):
+	movl	-21(%esi), %eax
+	movl	-21(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(17bytes):
+	movl	-17(%esi), %eax
+	movl	-17(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(13bytes):
+	movl	-13(%esi), %eax
+	movl	-13(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(9bytes):
+	movl	-9(%esi), %eax
+	movl	-9(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(5bytes):
+	movl	-5(%esi), %eax
+	movl	-5(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(1bytes):
+	movzbl	-1(%esi), %eax
+	cmpb	-1(%edx), %al
+	jne	L(set)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (esi, 0)
+	cfi_rel_offset (ebx, 4)
+L(30bytes):
+	movl	-30(%esi), %eax
+	movl	-30(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(26bytes):
+	movl	-26(%esi), %eax
+	movl	-26(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(22bytes):
+	movl	-22(%esi), %eax
+	movl	-22(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(18bytes):
+	movl	-18(%esi), %eax
+	movl	-18(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(14bytes):
+	movl	-14(%esi), %eax
+	movl	-14(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(10bytes):
+	movl	-10(%esi), %eax
+	movl	-10(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(6bytes):
+	movl	-6(%esi), %eax
+	movl	-6(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%esi), %eax
+	movzwl	-2(%edx), %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpl	%ecx, %eax
+	jne	L(set)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (esi, 0)
+	cfi_rel_offset (ebx, 4)
+L(31bytes):
+	movl	-31(%esi), %eax
+	movl	-31(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%esi), %eax
+	movl	-27(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%esi), %eax
+	movl	-23(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%esi), %eax
+	movl	-19(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%esi), %eax
+	movl	-15(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%esi), %eax
+	movl	-11(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%esi), %eax
+	movl	-7(%edx), %ecx
+	cmpl	%ecx, %eax
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%esi), %eax
+	movzwl	-3(%edx), %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpl	%ecx, %eax
+	jne	L(set)
+	movzbl	-1(%esi), %eax
+	cmpb	-1(%edx), %al
+	jne	L(set)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	xorl	%eax, %eax
+	RETURN
+
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (esi, 0)
+	cfi_rel_offset (ebx, 4)
+	ALIGN (4)
+/* ECX >= 32.  */
+L(32bytesormore):
+	subl	$32, %ecx
+
+	movl	(%esi), %eax
+	cmpl	(%edx), %eax
+	jne	L(load_ecx)
+
+	movl	4(%esi), %eax
+	cmpl	4(%edx), %eax
+	jne	L(load_ecx_4)
+
+	movl	8(%esi), %eax
+	cmpl	8(%edx), %eax
+	jne	L(load_ecx_8)
+
+	movl	12(%esi), %eax
+	cmpl	12(%edx), %eax
+	jne	L(load_ecx_12)
+
+	movl	16(%esi), %eax
+	cmpl	16(%edx), %eax
+	jne	L(load_ecx_16)
+
+	movl	20(%esi), %eax
+	cmpl	20(%edx), %eax
+	jne	L(load_ecx_20)
+
+	movl	24(%esi), %eax
+	cmpl	24(%edx), %eax
+	jne	L(load_ecx_24)
+
+	movl	28(%esi), %eax
+	cmpl	28(%edx), %eax
+	jne	L(load_ecx_28)
+
+	addl	$32, %esi
+	addl	$32, %edx
+	cmpl	$32, %ecx
+	jge	L(32bytesormore)
+
+	LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+	addl	%ecx, %edx
+	addl	%ecx, %esi
+	jmp	*%ebx
+
+L(load_ecx_28):
+	addl	$0x4, %edx
+L(load_ecx_24):
+	addl	$0x4, %edx
+L(load_ecx_20):
+	addl	$0x4, %edx
+L(load_ecx_16):
+	addl	$0x4, %edx
+L(load_ecx_12):
+	addl	$0x4, %edx
+L(load_ecx_8):
+	addl	$0x4, %edx
+L(load_ecx_4):
+	addl	$0x4, %edx
+L(load_ecx):
+	movl	(%edx), %ecx
+
+L(find_diff):
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpb	%ch, %ah
+	jne	L(set)
+	shrl	$16,%eax
+	shrl	$16,%ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	/* We get here only if we already know there is a
+	   difference.  */
+	cmpl	%ecx, %eax
+L(set):
+	sbbl	%eax, %eax
+	sbbl	$-1, %eax
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	RETURN
+END (memcmp)
+
+	.section	.rodata
+	ALIGN (2)
+L(table_32bytes) :
+	.long	L(0bytes) - L(table_32bytes)
+	.long	L(1bytes) - L(table_32bytes)
+	.long	L(2bytes) - L(table_32bytes)
+	.long	L(3bytes) - L(table_32bytes)
+	.long	L(4bytes) - L(table_32bytes)
+	.long	L(5bytes) - L(table_32bytes)
+	.long	L(6bytes) - L(table_32bytes)
+	.long	L(7bytes) - L(table_32bytes)
+	.long	L(8bytes) - L(table_32bytes)
+	.long	L(9bytes) - L(table_32bytes)
+	.long	L(10bytes) - L(table_32bytes)
+	.long	L(11bytes) - L(table_32bytes)
+	.long	L(12bytes) - L(table_32bytes)
+	.long	L(13bytes) - L(table_32bytes)
+	.long	L(14bytes) - L(table_32bytes)
+	.long	L(15bytes) - L(table_32bytes)
+	.long	L(16bytes) - L(table_32bytes)
+	.long	L(17bytes) - L(table_32bytes)
+	.long	L(18bytes) - L(table_32bytes)
+	.long	L(19bytes) - L(table_32bytes)
+	.long	L(20bytes) - L(table_32bytes)
+	.long	L(21bytes) - L(table_32bytes)
+	.long	L(22bytes) - L(table_32bytes)
+	.long	L(23bytes) - L(table_32bytes)
+	.long	L(24bytes) - L(table_32bytes)
+	.long	L(25bytes) - L(table_32bytes)
+	.long	L(26bytes) - L(table_32bytes)
+	.long	L(27bytes) - L(table_32bytes)
+	.long	L(28bytes) - L(table_32bytes)
+	.long	L(29bytes) - L(table_32bytes)
+	.long	L(30bytes) - L(table_32bytes)
+	.long	L(31bytes) - L(table_32bytes)
+
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/memcpy.S b/REORG.TODO/sysdeps/i386/i686/memcpy.S
new file mode 100644
index 0000000000..1d61447430
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcpy.S
@@ -0,0 +1,98 @@
+/* Copy memory block and return pointer to beginning of destination block
+   For Intel 80x86, x>=6.
+   This file is part of the GNU C Library.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+	.text
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memcpy_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memcpy_chk)
+#endif
+ENTRY (memcpy)
+
+	movl	%edi, %eax		/* keep caller's %edi in %eax */
+	movl	DEST(%esp), %edi
+	movl	%esi, %edx		/* keep caller's %esi in %edx */
+	movl	SRC(%esp), %esi
+
+	movl	%edi, %ecx
+	xorl	%esi, %ecx
+	andl	$3, %ecx		/* ZF set iff DEST and SRC agree in their low two bits */
+	movl	LEN(%esp), %ecx		/* movl does not change the flags set by andl */
+	cld				/* string instructions move upward */
+	jne	.Lunaligned		/* still testing the andl result */
+
+	cmpl	$3, %ecx
+	jbe	.Lunaligned		/* LEN <= 3 */
+
+	testl	$3, %esi		/* copy single bytes until %esi is 4-byte aligned */
+	je	1f
+	movsb
+	decl	%ecx
+	testl	$3, %esi
+	je	1f
+	movsb
+	decl	%ecx
+	testl	$3, %esi
+	je	1f
+	movsb
+	decl	%ecx
+1:	pushl	%eax			/* spill saved %edi so %eax can be scratch; NOTE(review): no cfi_adjust_cfa_offset here -- confirm */
+	movl	%ecx, %eax
+	shrl	$2, %ecx		/* number of whole dwords */
+	andl	$3, %eax		/* number of trailing bytes */
+	rep
+	movsl
+	movl	%eax, %ecx
+	rep
+	movsb
+	popl	%eax			/* saved %edi back in %eax */
+
+.Lend:	movl	%eax, %edi		/* restore caller's %edi */
+	movl	%edx, %esi		/* restore caller's %esi */
+	movl	DEST(%esp), %eax	/* return value is DEST */
+
+	ret
+
+	/* When we come here the pointers do not have the same
+	   alignment or the length is too short.  No need to optimize for
+	   aligned memory accesses. */
+.Lunaligned:
+	shrl	$1, %ecx		/* CF = bit 0 of LEN */
+	jnc	1f
+	movsb				/* odd length: copy one byte */
+1:	shrl	$1, %ecx		/* CF = bit 1 of LEN */
+	jnc	2f
+	movsw				/* copy one word */
+2:	rep				/* %ecx is now LEN / 4 */
+	movsl
+	jmp	.Lend
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/memmove.S b/REORG.TODO/sysdeps/i386/i686/memmove.S
new file mode 100644
index 0000000000..f60c3d501b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memmove.S
@@ -0,0 +1,120 @@
+/* Copy memory block and return pointer to beginning of destination block
+   For Intel 80x86, x>=6.
+   This file is part of the GNU C Library.
+   Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 2003.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* one spilled register */
+#define RTN	PARMS
+
+	.text
+
+#ifdef USE_AS_BCOPY
+# define SRC	RTN
+# define DEST	SRC+4
+# define LEN	DEST+4
+#else
+# define DEST	RTN
+# define SRC	DEST+4
+# define LEN	SRC+4
+
+# if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memmove_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memmove_chk)
+# endif
+#endif
+
+ENTRY (memmove)
+
+	pushl	%edi			/* save caller's %edi on the stack */
+	cfi_adjust_cfa_offset (4)
+
+	movl	LEN(%esp), %ecx
+	movl	DEST(%esp), %edi
+	cfi_rel_offset (edi, 0)
+	movl	%esi, %edx		/* keep caller's %esi in %edx */
+	movl	SRC(%esp), %esi
+	cfi_register (esi, edx)
+
+	movl	%edi, %eax
+	subl	%esi, %eax		/* %eax = DEST - SRC */
+	cmpl	%eax, %ecx
+	ja	3f			/* LEN > DEST - SRC (unsigned): copy backward */
+
+	cld				/* forward copy: string instructions move upward */
+	shrl	$1, %ecx		/* CF = bit 0 of LEN */
+	jnc	1f
+	movsb				/* odd length: copy one byte */
+1:	shrl	$1, %ecx		/* CF = bit 1 of LEN */
+	jnc	2f
+	movsw				/* copy one word */
+2:	rep				/* %ecx is now LEN / 4 */
+	movsl
+	movl	%edx, %esi		/* restore caller's %esi */
+	cfi_restore (esi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* return value is DEST */
+#endif
+
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_adjust_cfa_offset (4)	/* unwind state for the code at 3: below */
+	cfi_rel_offset (edi, 0)
+	cfi_register (esi, edx)
+
+	/* Backward copying.  */
+3:	std				/* string instructions move downward */
+	leal	-1(%edi, %ecx), %edi	/* last byte of destination */
+	leal	-1(%esi, %ecx), %esi	/* last byte of source */
+	shrl	$1, %ecx		/* CF = bit 0 of LEN */
+	jnc	1f
+	movsb
+1:	subl	$1, %edi		/* step back to the start of the preceding word */
+	subl	$1, %esi
+	shrl	$1, %ecx		/* CF = bit 1 of LEN */
+	jnc	2f
+	movsw
+2:	subl	$2, %edi		/* step back to the start of the preceding dword */
+	subl	$2, %esi
+	rep				/* %ecx is now LEN / 4 */
+	movsl
+	movl	%edx, %esi		/* restore caller's %esi */
+	cfi_restore (esi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* return value is DEST */
+#endif
+
+	cld				/* clear the direction flag again before returning */
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (memmove)
+#ifndef USE_AS_BCOPY
+libc_hidden_builtin_def (memmove)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/mempcpy.S
new file mode 100644
index 0000000000..31cb4efdb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/mempcpy.S
@@ -0,0 +1,65 @@
+/* Copy memory block and return pointer to following byte.
+   For Intel 80x86, x>=6.
+   This file is part of the GNU C Library.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+	.text
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__mempcpy_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__mempcpy_chk)
+#endif
+ENTRY (__mempcpy)
+
+	movl	LEN(%esp), %ecx
+	movl	%edi, %eax		/* keep caller's %edi in %eax */
+	cfi_register (edi, eax)
+	movl	DEST(%esp), %edi
+	movl	%esi, %edx		/* keep caller's %esi in %edx */
+	cfi_register (esi, edx)
+	movl	SRC(%esp), %esi
+	cld				/* string instructions move upward */
+	shrl	$1, %ecx		/* CF = bit 0 of LEN */
+	jnc	1f
+	movsb				/* odd length: copy one byte */
+1:	shrl	$1, %ecx		/* CF = bit 1 of LEN */
+	jnc	2f
+	movsw				/* copy one word */
+2:	rep				/* %ecx is now LEN / 4 */
+	movsl
+	xchgl	%edi, %eax		/* %eax = byte after the copy (result); %edi restored */
+	cfi_restore (edi)
+	movl	%edx, %esi		/* restore caller's %esi */
+	cfi_restore (esi)
+
+	ret
+END (__mempcpy)
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/memset.S b/REORG.TODO/sysdeps/i386/i686/memset.S
new file mode 100644
index 0000000000..24d06178d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memset.S
@@ -0,0 +1,100 @@
+/* memset/bzero -- set memory area to CH/0
+   Highly optimized version for ix86, x>=6.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#ifdef USE_AS_BZERO
+# define DEST	PARMS
+# define LEN	DEST+4
+#else
+# define RTN	PARMS
+# define DEST	RTN
+# define CHR	DEST+4
+# define LEN	CHR+4
+#endif
+
+        .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY_CHK (__memset_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk)
+#endif
+ENTRY (memset)
+
+	cld			/* string instructions move upward */
+	pushl	%edi		/* save caller's %edi on the stack */
+	cfi_adjust_cfa_offset (4)
+	movl	DEST(%esp), %edx
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xorl	%eax, %eax	/* fill with 0 */
+#else
+	movzbl	CHR(%esp), %eax	/* fill byte, zero-extended */
+#endif
+	jecxz	1f		/* LEN == 0: nothing to store */
+	movl	%edx, %edi
+	cfi_rel_offset (edi, 0)
+	andl	$3, %edx	/* %edx = DEST & 3; sets ZF and PF for the jumps below */
+	jz	2f	/* aligned */
+	jp	3f	/* misaligned at 3, store just one byte below */
+	stosb		/* misaligned at 1 or 2, store two bytes */
+	decl	%ecx
+	jz	1f
+3:	stosb
+	decl	%ecx
+	jz	1f
+	xorl	$1, %edx	/* zero only if the misalignment was 1 */
+	jnz	2f	/* was misaligned at 2 or 3, now aligned */
+	stosb		/* was misaligned at 1, store third byte */
+	decl	%ecx
+2:	movl	%ecx, %edx
+	shrl	$2, %ecx	/* number of whole dwords */
+	andl	$3, %edx	/* number of trailing bytes */
+#ifndef USE_AS_BZERO
+	imul	$0x01010101, %eax	/* replicate the fill byte into all four bytes */
+#endif
+	rep
+	stosl
+	movl	%edx, %ecx
+	rep
+	stosb
+
+1:
+#ifndef USE_AS_BZERO
+	movl DEST(%esp), %eax	/* start address of destination is result */
+#endif
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (memset)
+libc_hidden_builtin_def (memset)
+
+#if defined SHARED && IS_IN (libc) && !defined __memset_chk \
+    && !defined USE_AS_BZERO
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/memusage.h b/REORG.TODO/sysdeps/i386/i686/memusage.h
new file mode 100644
index 0000000000..77a020d7c0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memusage.h
@@ -0,0 +1,21 @@
+/* Copyright (C) 2000-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; })	/* Yields the value of the %esp register.  */
+#define GETTIME(low,high) asm ("rdtsc" : "=a" (low), "=d" (high))	/* rdtsc: %eax goes to LOW, %edx to HIGH.  */
+
+#include <sysdeps/generic/memusage.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
new file mode 100644
index 0000000000..4a0c20c051
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
@@ -0,0 +1,44 @@
+ifeq ($(subdir),csu)
+tests += test-multiarch
+endif
+
+ifeq ($(subdir),string)
+gen-as-const-headers += locale-defines.sym
+sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
+		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
+		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+		   memcmp-ssse3 memcmp-sse4 varshift \
+		   strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+		   strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+		   strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
+		   strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
+		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
+		   memchr-sse2 memchr-sse2-bsf \
+		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+		   rawmemchr-sse2 rawmemchr-sse2-bsf \
+		   strnlen-sse2 strnlen-c \
+		   strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
+		   strncase_l-c strncase-c strncase_l-ssse3 \
+		   strcasecmp_l-sse4 strncase_l-sse4 \
+		   bcopy-sse2-unaligned memcpy-sse2-unaligned \
+		   mempcpy-sse2-unaligned memmove-sse2-unaligned \
+		   strcspn-c strpbrk-c strspn-c
+CFLAGS-varshift.c += -msse4
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
+		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
+		   wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
+endif
+
+ifeq ($(subdir),math)
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
new file mode 100644
index 0000000000..efef2a10dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
new file mode 100644
index 0000000000..cbc8b420e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
new file mode 100644
index 0000000000..36aac44b9c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
new file mode 100644
index 0000000000..877f82c28f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,59 @@
+/* Multiple versions of bcopy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(bcopy)
+	.type	bcopy, @gnu_indirect_function	/* bcopy is an IFUNC; this code picks the implementation */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bcopy_ia32)	/* default choice */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* presumably taken when SSE2 is absent (macro defined elsewhere) -- confirm */
+	LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* presumably: Fast_Unaligned_Load set, keep the unaligned SSE2 version */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* presumably taken when SSSE3 is absent */
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* presumably taken when Fast_Rep_String is not set */
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep)
+2:	ret	/* result is the last LOAD_FUNC_GOT_EAX selection (in %eax, going by the macro name) */
+END(bcopy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __bcopy_ia32, @function; \
+	.p2align 4; \
+	.globl __bcopy_ia32; \
+	.hidden __bcopy_ia32; \
+	__bcopy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
+
+#endif
+
+#include "../bcopy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
new file mode 100644
index 0000000000..507b288bb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2_rep __bzero_sse2_rep
+#include "memset-sse2-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
new file mode 100644
index 0000000000..8d04512e4e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2 __bzero_sse2
+#include "memset-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
new file mode 100644
index 0000000000..9dac490aa2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,62 @@
+/* Multiple versions of bzero
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__bzero)
+	.type	__bzero, @gnu_indirect_function	/* __bzero is an IFUNC symbol; this code picks its implementation.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bzero_ia32)	/* Default choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* Presumably ZF means the feature bit is clear (confirm in init-arch.h): keep ia32.  */
+	LOAD_FUNC_GOT_EAX ( __bzero_sse2)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Fast_Rep_String test zero: keep sse2.  */
+	LOAD_FUNC_GOT_EAX (__bzero_sse2_rep)
+2:	ret	/* Result is whatever the last LOAD_FUNC_GOT_EAX left behind (EAX, going by the macro name).  */
+END(__bzero)
+
+# undef ENTRY	/* From here on ENTRY ignores NAME and always opens the hidden symbol __bzero_ia32.  */
+# define ENTRY(name) \
+	.type __bzero_ia32, @function; \
+	.p2align 4; \
+	.globl __bzero_ia32; \
+	.hidden __bzero_ia32; \
+	__bzero_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Likewise END ignores NAME: it closes __bzero_ia32 and records its size.  */
+# define END(name) \
+	cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which the PLT used by
+   IFUNC needs.  So bind the internal alias directly to __bzero_ia32.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI___bzero; __GI___bzero = __bzero_ia32
+# endif	/* SHARED */
+#endif
+
+#include "../bzero.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e8026a2a78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -0,0 +1,376 @@
+/* Enumerate available IFUNC implementations of a function.  i686 version.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ifunc-impl-list.h>
+#include "init-arch.h"
+
+/* Maximum number of IFUNC implementations.  */
+#define MAX_IFUNC	4
+
+/* Fill ARRAY (capacity MAX, which must be at least MAX_IFUNC) with the IFUNC
+   implementations listed below for function NAME; return the entry count.  */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+  assert (max >= MAX_IFUNC);	/* The longest lists below (e.g. bcopy) have MAX_IFUNC entries.  */
+
+  size_t i = 0;	/* Returned count; presumably advanced by the IFUNC_IMPL macros (see ifunc-impl-list.h).  */
+
+  /* Support sysdeps/i386/i686/multiarch/bcopy.S.  */
+  IFUNC_IMPL (i, name, bcopy,
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
+			      __bcopy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/bzero.S.  */
+  IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memchr.S.  */
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memcmp.S.  */
+  IFUNC_IMPL (i, name, memcmp,
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __memcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+			      __memcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memmove_chk.S.  */
+  IFUNC_IMPL (i, name, __memmove_chk,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memmove_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memmove.S.  */
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
+			      __memmove_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memrchr.S.  */
+  IFUNC_IMPL (i, name, memrchr,
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memset_chk.S.  */
+  IFUNC_IMPL (i, name, __memset_chk,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memset_chk_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memset.S.  */
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/rawmemchr.S.  */
+  IFUNC_IMPL (i, name, rawmemchr,
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+			      __rawmemchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+			      __rawmemchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/stpncpy.S.  */
+  IFUNC_IMPL (i, name, stpncpy,
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2),
+			      __stpncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/stpcpy.S.  */
+  IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2),
+			      __stpcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcasecmp.S.  */
+  IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S.  */
+  IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_l_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+			      __strcasecmp_l_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcat.S.  */
+  IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+			      __strcat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2),
+			      __strcat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strchr.S.  */
+  IFUNC_IMPL (i, name, strchr,
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcmp.S.  */
+  IFUNC_IMPL (i, name, strcmp,
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+			      __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcpy.S.  */
+  IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+			      __strcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2),
+			      __strcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcspn.S.  */
+  IFUNC_IMPL (i, name, strcspn,
+	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strcspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncase.S.  */
+  IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
+			      __strncasecmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncase_l.S.  */
+  IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_l_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
+			      __strncasecmp_l_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncat.S.  */
+  IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
+			      __strncat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2),
+			      __strncat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncpy.S.  */
+  IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
+			      __strncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2),
+			      __strncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strnlen.S.  */
+  IFUNC_IMPL (i, name, strnlen,
+	      IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2),
+			      __strnlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strpbrk.S.  */
+  IFUNC_IMPL (i, name, strpbrk,
+	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
+			      __strpbrk_sse42)
+	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strrchr.S.  */
+  IFUNC_IMPL (i, name, strrchr,
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strspn.S.  */
+  IFUNC_IMPL (i, name, strspn,
+	      IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcschr.S.  */
+  IFUNC_IMPL (i, name, wcschr,
+	      IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2),
+			      __wcschr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcscmp.S.  */
+  IFUNC_IMPL (i, name, wcscmp,
+	      IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2),
+			      __wcscmp_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcscpy.S.  */
+  IFUNC_IMPL (i, name, wcscpy,
+	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+			      __wcscpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcslen.S.  */
+  IFUNC_IMPL (i, name, wcslen,
+	      IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2),
+			      __wcslen_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcsrchr.S.  */
+  IFUNC_IMPL (i, name, wcsrchr,
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2),
+			      __wcsrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wmemcmp.S.  */
+  IFUNC_IMPL (i, name, wmemcmp,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __wmemcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
+			      __wmemcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32))
+
+#ifdef SHARED
+  /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __memcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memcpy.S.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
+			      __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __mempcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __mempcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/mempcpy.S.  */
+  IFUNC_IMPL (i, name, mempcpy,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
+			      __mempcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strlen.S.  */
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncmp.S.  */
+  IFUNC_IMPL (i, name, strncmp,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strncmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
+			      __strncmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32))
+#endif /* SHARED */
+
+  return i;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
new file mode 100644
index 0000000000..aebff9a4f9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
@@ -0,0 +1,11 @@
+#include <locale/localeinfo.h>
+#include <langinfo.h>
+#include <stddef.h>
+
+--
+
+LOCALE_T___LOCALES		offsetof (struct __locale_struct, __locales)
+LC_CTYPE
+_NL_CTYPE_NONASCII_CASE
+LOCALE_DATA_VALUES		offsetof (struct __locale_data, values)
+SIZEOF_VALUES			sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 0000000000..dd316486e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,502 @@
+/* Optimized memchr with sse2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push REG and record it in the unwind info.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop REG and undo the unwind record.  */
+
+# define PARMS  4	/* Stack offset of the first argument at entry (nothing is pushed before the loads).  */
+# define STR1  PARMS	/* First argument.  */
+# define STR2  STR1+4	/* Second argument.  */
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4	/* Third argument; not used when built as rawmemchr.  */
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);	/* CFI_PUSH after ret re-marks %edi as pushed for the code that follows.  */
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2_bsf	/* Default symbol name; an including file may predefine MEMCHR.  */
+# endif
+
+	.text
+ENTRY (MEMCHR)
+
+	mov	STR1(%esp), %ecx	/* ecx = first argument: address to search from.  */
+	movd	STR2(%esp), %xmm1	/* xmm1 = second argument; only its low byte is used below.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx	/* edx = third argument: byte count.  */
+	test	%edx, %edx
+	jz	L(return_null_1)	/* Zero length: nothing to search.  */
+# endif
+	mov	%ecx, %eax
+
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx	/* ecx = offset of the start within its 64-byte block.  */
+	pshufd	$0, %xmm1, %xmm1	/* xmm1 = the target byte replicated into all 16 lanes.  */
+
+	cmp	$48, %ecx
+	ja	L(crosscache)	/* A 16-byte load from here would run past the 64-byte block.  */
+
+	movdqu	(%eax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	je	L(unaligned_no_match_1)
+/* Check which byte is a match.  */
+	bsf	%ecx, %ecx
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%ecx, %edx
+	jbe	L(return_null_1)	/* Match index is not below the length: out of range.  */
+# endif
+	add	%ecx, %eax	/* eax = address of the matching byte.  */
+	ret
+
+	.p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$16, %edx
+	jbe	L(return_null_1)	/* Length <= 16: everything was already checked.  */
+	PUSH	(%edi)
+	lea	16(%eax), %edi
+	and	$15, %eax
+	and	$-16, %edi	/* edi = next 16-byte aligned address.  */
+	add	%eax, %edx	/* Count is now relative to the aligned edi.  */
+# else
+	lea	16(%eax), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(return_null_1):	/* NOTE(review): every jump here precedes any PUSH, yet the CFI state here textually follows PUSH (%edi) above; confirm whether CFI_POP below belongs before this label.  */
+	xor	%eax, %eax
+	ret
+
+# ifndef USE_AS_RAWMEMCHR
+	CFI_POP	(%edi)
+# endif
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	PUSH	(%edi)
+	mov	%eax, %edi
+	and	$15, %ecx	/* ecx = offset of the start within its 16-byte chunk.  */
+	and	$-16, %edi	/* edi = start rounded down to 16 bytes.  */
+	movdqa	(%edi), %xmm0
+# else
+	mov	%eax, %edx
+	and	$15, %ecx
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	sar	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+/* Check which byte is a match.  */
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%eax, %edx
+	jbe	L(return_null)	/* Match index is not below the length: out of range.  */
+	add	%edi, %eax
+	add	%ecx, %eax	/* eax = aligned base + start offset + match index.  */
+	RETURN
+# else
+	add	%edx, %eax
+	add	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+        /* Calculate the last acceptable address and check for possible
+           addition overflow by using saturated math:
+           edx = ecx + edx
+           edx |= -(edx < ecx)  */
+	add	%ecx, %edx
+	sbb	%eax, %eax	/* eax = -1 if the add carried, else 0.  */
+	or	%eax, %edx
+	sub	$16, %edx
+	jbe	L(return_null)
+	add	$16, %edi
+# else
+	add	$16, %edx
+# endif
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* At most 64 bytes left: finish with length checks.  */
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi	/* Pointer already advanced; L(matches0) compensates with -16.  */
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	test	$0x3f, %edi
+# else
+	test	$0x3f, %edx
+# endif
+	jz	L(align64_loop)	/* Pointer is already 64-byte aligned.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm3
+# else
+	movdqa	48(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %ecx
+	and	$-64, %edi	/* Round the pointer down to a 64-byte boundary...  */
+	and	$63, %ecx
+	add	%ecx, %edx	/* ...and add the bytes stepped back to the remaining count.  */
+# else
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3	/* Merge the four compare results so that one mask...  */
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+	pmovmskb %xmm4, %eax	/* ...is non-zero if any of the 64 bytes matched.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi	/* Step back to the block that contains the match.  */
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3	/* xmm3 was overwritten by pmaxub above, so reload it.  */
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+
+	pcmpeqb	%xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1	/* Overwrites the pattern in xmm1; every path from here returns.  */
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	pmovmskb %xmm1, %eax
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	48(%edi, %eax), %eax
+	RETURN
+# else
+	lea	48(%edx, %eax), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx	/* Undo the last sub: edx = bytes left, at most 64.  */
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)	/* More than 32 bytes left, so a match in the first 32 is in range.  */
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)	/* The _1 variants also check the match against the length.  */
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	16(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	xor	%eax, %eax
+	RETURN
+# endif
+	.p2align 4
+L(matches0):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	-16(%eax, %edi), %eax	/* Pointer was already advanced by 64; the match is in the chunk at -16.  */
+	RETURN
+# else
+	lea	-16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	add	%edi, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches16):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches32):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	32(%eax, %edi), %eax
+	RETURN
+# else
+	lea	32(%eax, %edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(matches_1):
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)	/* Match lies at or beyond the remaining length.  */
+
+	add	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(matches16_1):
+	sub	$16, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	16(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches32_1):
+	sub	$32, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	32(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches48_1):
+	sub	$48, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	48(%edi, %eax), %eax
+	RETURN
+# endif
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 0000000000..172d70de13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,709 @@
+/* Optimized memchr with sse2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+#  define ENTRANCE PUSH(%edi);
+#  define PARMS  8
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# else
+#  define ENTRANCE
+#  define PARMS  4
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2
+# endif
+
+	atom_text_section
+ENTRY (MEMCHR)
+	ENTRANCE	/* memchr build: pushes %edi; rawmemchr build: expands to nothing */
+	mov	STR1(%esp), %ecx	/* ecx = first stack argument: start of the buffer */
+	movd	STR2(%esp), %xmm1	/* low dword of xmm1 = second stack argument: byte to find */
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx	/* edx = third stack argument: number of bytes to search */
+	test	%edx, %edx
+	jz	L(return_null)	/* zero length: nothing to search */
+# endif
+
+	punpcklbw %xmm1, %xmm1	/* target byte now in bytes 0-1 of xmm1 */
+# ifndef USE_AS_RAWMEMCHR
+	mov	%ecx, %edi	/* memchr: edi is the running pointer, edx the remaining length */
+# else
+	mov	%ecx, %edx	/* rawmemchr: edx is the running pointer, there is no length */
+# endif
+	punpcklbw %xmm1, %xmm1	/* target byte now in bytes 0-3 of xmm1 */
+
+	and	$63, %ecx	/* ecx = offset of the start inside its 64-byte block */
+	pshufd	$0, %xmm1, %xmm1	/* replicate dword 0: all 16 bytes of xmm1 hold the target */
+	cmp	$48, %ecx
+	ja	L(crosscache)	/* offset > 48: a 16-byte load from the start would run past the 64-byte block */
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqu	(%edi), %xmm0	/* unaligned load of the first 16 bytes */
+# else
+	movdqu	(%edx), %xmm0	/* unaligned load of the first 16 bytes */
+# endif
+	pcmpeqb	%xmm1, %xmm0	/* matching bytes become 0xff, all others 0 */
+	pmovmskb %xmm0, %eax	/* eax bit i is set when byte i matched */
+	test	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog)	/* hit in the first 16 bytes; that path still checks it against edx */
+
+	sub	$16, %edx
+	jbe	L(return_null)	/* length <= 16: the whole buffer has been examined */
+	lea	16(%edi), %edi
+	and	$15, %ecx	/* ecx = start & 15 */
+	and	$-16, %edi	/* edi = first 16-byte boundary after the start */
+	add	%ecx, %edx	/* edx = len - 16 + (start & 15) = bytes left counted from edi */
+# else
+	jnz	L(match_case1_prolog)	/* hit in the first 16 bytes; no length to check */
+	lea	16(%edx), %edx
+	and	$-16, %edx	/* edx = first 16-byte boundary after the start */
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %ecx	/* ecx = misalignment of the start inside its 16-byte block */
+# ifndef USE_AS_RAWMEMCHR
+	and	$-16, %edi	/* round the pointer down to the 16-byte boundary */
+	movdqa	(%edi), %xmm0	/* aligned load: covers the start without leaving its 16-byte block */
+# else
+	and	$-16, %edx	/* round the pointer down to the 16-byte boundary */
+	movdqa	(%edx), %xmm0	/* aligned load: covers the start without leaving its 16-byte block */
+# endif
+	pcmpeqb	%xmm1, %xmm0	/* matching bytes become 0xff */
+	pmovmskb %xmm0, %eax	/* one mask bit per byte of the aligned block */
+	sar	%cl, %eax	/* drop the bits of bytes that lie before the real start; bit 0 = start byte */
+	test	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog1)	/* that entry adds ecx back to edi, then length-checks the hit */
+        /* "ecx" is less than 16.  Calculate "edx + ecx - 16" by using
+	   "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid
+	   possible addition overflow.  */
+	neg	%ecx
+	add	$16, %ecx	/* ecx = 16 - misalignment = bytes just examined from the start */
+	sub	%ecx, %edx	/* edx = bytes still to search */
+	jbe	L(return_null)	/* the buffer ended inside this first block */
+	lea	16(%edi), %edi	/* advance to the next aligned 16-byte block */
+# else
+	jnz	L(match_case1_prolog1)	/* that entry adds ecx back to edx before decoding the mask */
+	lea	16(%edx), %edx	/* advance to the next aligned 16-byte block */
+# endif
+
+	.p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%edi), %xmm0
+# else
+	lea	64(%edx), %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	lea	64(%edx), %edx
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* 64 or fewer bytes left: L(exit_loop) adds the 64 back and checks lengths */
+	movdqa	(%edi), %xmm0	/* load the four 16-byte chunks of this 64-byte block */
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0	/* load the four 16-byte chunks of this 64-byte block */
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0	/* each chunk: matching bytes become 0xff, others 0 */
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3	/* bytes are 0 or 0xff, so unsigned max merges results like OR */
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4	/* xmm4 = merge of all four; xmm3/xmm4 overwritten, xmm0/xmm2 intact */
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi	/* advance before testing; undone below if a match was seen */
+# else
+	add	$64, %edx	/* advance before testing; undone below if a match was seen */
+# endif
+	pmovmskb %xmm4, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)	/* no match anywhere in these 64 bytes */
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi	/* step back to the block that contains the match */
+# else
+	sub	$64, %edx	/* step back to the block that contains the match */
+# endif
+
+	pmovmskb %xmm0, %eax	/* chunk 0 result is still valid in xmm0 */
+	xor	%ecx, %ecx	/* ecx = offset of the chunk being tested; L(match_case1) adds it to the pointer */
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	pmovmskb %xmm2, %eax	/* chunk 1 result is still valid in xmm2 */
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3	/* xmm3 was overwritten by the merge: reload chunk 2 */
+# else
+	movdqa	32(%edx), %xmm3	/* xmm3 was overwritten by the merge: reload chunk 2 */
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1	/* recompute chunk 3 in place; this overwrites the broadcast target in xmm1 */
+# else
+	pcmpeqb	48(%edx), %xmm1	/* recompute chunk 3 in place; this overwrites the broadcast target in xmm1 */
+# endif
+	pmovmskb %xmm1, %eax	/* no test: chunks 0-2 were clean, so the match is here */
+	lea	16(%ecx), %ecx	/* ecx = 48; execution falls through into L(match_case1) */
+
+	.p2align 4
+L(match_case1):
+# ifndef USE_AS_RAWMEMCHR
+	add	%ecx, %edi
+# else
+L(match_case1_prolog1):
+	add	%ecx, %edx
+L(match_case1_prolog):
+# endif
+	test	%al, %al
+	jz	L(match_case1_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case1_8)
+	test	$0x01, %al
+	jnz	L(ExitCase1_1)
+	test	$0x02, %al
+	jnz	L(ExitCase1_2)
+	test	$0x04, %al
+	jnz	L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+	lea	3(%edi), %eax
+	RETURN
+# else
+	lea	3(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_8):
+	test	$0x10, %al
+	jnz	L(ExitCase1_5)
+	test	$0x20, %al
+	jnz	L(ExitCase1_6)
+	test	$0x40, %al
+	jnz	L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+	lea	7(%edi), %eax
+	RETURN
+# else
+	lea	7(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case1_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase1_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase1_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+	lea	11(%edi), %eax
+	RETURN
+# else
+	lea	11(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase1_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase1_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+	lea	15(%edi), %eax
+	RETURN
+# else
+	lea	15(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx	/* undo the "sub $64" done before jumping here: edx = bytes left from edi */
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx	/* ecx = offset of the chunk being tested */
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)	/* L(match_case2) subtracts ecx from edx, then verifies the hit is within length */
+	cmp	$16, %edx
+	jbe	L(return_null)	/* at most 16 bytes were left and they held no match */
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$32, %edx
+	jbe	L(return_null)	/* at most 32 bytes were left and they held no match */
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$48, %edx
+	jbe	L(return_null)	/* at most 48 bytes were left and they held no match */
+
+	pcmpeqb	48(%edi), %xmm1	/* last chunk: compare in place, overwriting the broadcast target */
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+
+	xor	%eax, %eax	/* nothing found in the tail: return NULL */
+	RETURN
+# endif
+
+	.p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %eax
+	RETURN
+# else
+	mov	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+	lea	1(%edi), %eax
+	RETURN
+# else
+	lea	1(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+	lea	2(%edi), %eax
+	RETURN
+# else
+	lea	2(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+	lea	4(%edi), %eax
+	RETURN
+# else
+	lea	4(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+	lea	5(%edi), %eax
+	RETURN
+# else
+	lea	5(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+	lea	6(%edi), %eax
+	RETURN
+# else
+	lea	6(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+	lea	8(%edi), %eax
+	RETURN
+# else
+	lea	8(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+	lea	9(%edi), %eax
+	RETURN
+# else
+	lea	9(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+	lea	10(%edi), %eax
+	RETURN
+# else
+	lea	10(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+	lea	12(%edi), %eax
+	RETURN
+# else
+	lea	12(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+	lea	13(%edi), %eax
+	RETURN
+# else
+	lea	13(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+	lea	14(%edi), %eax
+	RETURN
+# else
+	lea	14(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(match_case2):
+	sub	%ecx, %edx
+L(match_case2_prolog1):
+	add	%ecx, %edi
+L(match_case2_prolog):
+	test	%al, %al
+	jz	L(match_case2_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case2_8)
+	test	$0x01, %al
+	jnz	L(ExitCase2_1)
+	test	$0x02, %al
+	jnz	L(ExitCase2_2)
+	test	$0x04, %al
+	jnz	L(ExitCase2_3)
+	sub	$4, %edx
+	jb	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_8):
+	test	$0x10, %al
+	jnz	L(ExitCase2_5)
+	test	$0x20, %al
+	jnz	L(ExitCase2_6)
+	test	$0x40, %al
+	jnz	L(ExitCase2_7)
+	sub	$8, %edx
+	jb	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case2_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase2_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase2_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase2_11)
+	sub	$12, %edx
+	jb	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase2_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase2_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase2_15)
+	sub	$16, %edx
+	jb	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_1):
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_2):
+	sub	$2, %edx
+	jb	L(return_null)
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_3):
+	sub	$3, %edx
+	jb	L(return_null)
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_5):
+	sub	$5, %edx
+	jb	L(return_null)
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_6):
+	sub	$6, %edx
+	jb	L(return_null)
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_7):
+	sub	$7, %edx
+	jb	L(return_null)
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_9):
+	sub	$9, %edx
+	jb	L(return_null)
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_10):
+	sub	$10, %edx
+	jb	L(return_null)
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_11):
+	sub	$11, %edx
+	jb	L(return_null)
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_13):
+	sub	$13, %edx
+	jb	L(return_null)
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_14):
+	sub	$14, %edx
+	jb	L(return_null)
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_15):
+	sub	$15, %edx
+	jb	L(return_null)
+	lea	14(%edi), %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 0000000000..bd0dace290
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of memchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__memchr)
+	.type	__memchr, @gnu_indirect_function	/* __memchr is an IFUNC: this code only selects an implementation */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* macro defined outside this file; presumably prepares access to the CPU feature data */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set by the SSE2 test (presumably: SSE2 absent) -> __memchr_ia32 */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f	/* ZF set by the Slow_BSF test (presumably: bsf is not slow) -> __memchr_sse2_bsf */
+
+	LOAD_FUNC_GOT_EAX ( __memchr_sse2)	/* otherwise select the SSE2 variant that avoids bsf */
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memchr_ia32)	/* the generic version, built from ../../memchr.S under this name */
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf)	/* the SSE2 variant that uses bsf */
+	ret
+END(__memchr)
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memchr_ia32, @function; \
+	.globl __memchr_ia32; \
+	.p2align 4; \
+	__memchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..2aa13048b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1225 @@
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_2
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define BLK1	PARMS
+# define BLK2	BLK1 + 4
+# define LEN	BLK2 + 4
+# define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register that contains
+   the index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+/* We first load PC into EBX.  */	\
+	SETUP_PIC_REG(bx);	\
+/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+/* Get the entry and convert the relative offset to the	\
+   absolute address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+/* We loaded the jump table and adjusted EDX/ESI. Go.  */	\
+	jmp	*%ebx
+# else
+#  define JMPTBL(I, B)	I
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register that contains
+   the index into the jump table.  SCALE is the scale of INDEX.  */
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(return0)
+# else
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+# endif
+
+	pxor	%xmm0, %xmm0
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+
+# ifndef USE_AS_WMEMCMP
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+# else
+	jb	L(less8bytes)
+	PUSH	(%ebx)
+# endif
+
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+
+L(nonzero):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+# endif
+
+	.p2align 4
+L(0bytes):
+	POP	(%ebx)
+	xor	%eax, %eax
+	ret
+
+# ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	je	L(return0)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+	.p2align 4
+L(return0):
+	xor	%eax, %eax
+	ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+# endif
+	.p2align 4
+L(64bytesormore):
+	PUSH	(%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifdef USE_AS_WMEMCMP
+
+/* This label is needed only to fill table_64bytes.  */
+L(unreal_case):
+/* no code here */
+
+# endif
+	.p2align 4
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# else
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	cmp	-4(%edx), %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(50bytes):
+	mov	$-50, %ebx
+	movdqu	-50(%eax), %xmm1
+	movdqu	-50(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	mov	$-34, %ebx
+	movdqu	-34(%eax), %xmm1
+	movdqu	-34(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(51bytes):
+	mov	$-51, %ebx
+	movdqu	-51(%eax), %xmm1
+	movdqu	-51(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	mov	$-35, %ebx
+	movdqu	-35(%eax), %xmm1
+	movdqu	-35(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+L(1bytes):
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(52bytes):
+	movdqu	-52(%eax), %xmm1
+	movdqu	-52(%edx), %xmm2
+	mov	$-52, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%eax), %xmm1
+	movdqu	-36(%edx), %xmm2
+	mov	$-36, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%eax), %xmm1
+	movdqu	-20(%edx), %xmm2
+	mov	$-20, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(53bytes):
+	movdqu	-53(%eax), %xmm1
+	movdqu	-53(%edx), %xmm2
+	mov	$-53, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	mov	$-37, %ebx
+	movdqu	-37(%eax), %xmm1
+	movdqu	-37(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	mov	$-21, %ebx
+	movdqu	-21(%eax), %xmm1
+	movdqu	-21(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(54bytes):
+	movdqu	-54(%eax), %xmm1
+	movdqu	-54(%edx), %xmm2
+	mov	$-54, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	mov	$-38, %ebx
+	movdqu	-38(%eax), %xmm1
+	movdqu	-38(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	mov	$-22, %ebx
+	movdqu	-22(%eax), %xmm1
+	movdqu	-22(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(55bytes):
+	movdqu	-55(%eax), %xmm1
+	movdqu	-55(%edx), %xmm2
+	mov	$-55, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	mov	$-39, %ebx
+	movdqu	-39(%eax), %xmm1
+	movdqu	-39(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	mov	$-23, %ebx
+	movdqu	-23(%eax), %xmm1
+	movdqu	-23(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(56bytes):
+	movdqu	-56(%eax), %xmm1
+	movdqu	-56(%edx), %xmm2
+	mov	$-56, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	mov	$-40, %ebx
+	movdqu	-40(%eax), %xmm1
+	movdqu	-40(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	mov	$-24, %ebx
+	movdqu	-24(%eax), %xmm1
+	movdqu	-24(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(57bytes):
+	movdqu	-57(%eax), %xmm1
+	movdqu	-57(%edx), %xmm2
+	mov	$-57, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	mov	$-41, %ebx
+	movdqu	-41(%eax), %xmm1
+	movdqu	-41(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	mov	$-25, %ebx
+	movdqu	-25(%eax), %xmm1
+	movdqu	-25(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(58bytes):
+	movdqu	-58(%eax), %xmm1
+	movdqu	-58(%edx), %xmm2
+	mov	$-58, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	mov	$-42, %ebx
+	movdqu	-42(%eax), %xmm1
+	movdqu	-42(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	mov	$-26, %ebx
+	movdqu	-26(%eax), %xmm1
+	movdqu	-26(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(59bytes):
+	movdqu	-59(%eax), %xmm1
+	movdqu	-59(%edx), %xmm2
+	mov	$-59, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	mov	$-43, %ebx
+	movdqu	-43(%eax), %xmm1
+	movdqu	-43(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	mov	$-27, %ebx
+	movdqu	-27(%eax), %xmm1
+	movdqu	-27(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(60bytes):
+	movdqu	-60(%eax), %xmm1
+	movdqu	-60(%edx), %xmm2
+	mov	$-60, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	mov	$-44, %ebx
+	movdqu	-44(%eax), %xmm1
+	movdqu	-44(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	mov	$-28, %ebx
+	movdqu	-28(%eax), %xmm1
+	movdqu	-28(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(61bytes):
+	movdqu	-61(%eax), %xmm1
+	movdqu	-61(%edx), %xmm2
+	mov	$-61, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	mov	$-45, %ebx
+	movdqu	-45(%eax), %xmm1
+	movdqu	-45(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	mov	$-29, %ebx
+	movdqu	-29(%eax), %xmm1
+	movdqu	-29(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(62bytes):
+	movdqu	-62(%eax), %xmm1
+	movdqu	-62(%edx), %xmm2
+	mov	$-62, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	mov	$-46, %ebx
+	movdqu	-46(%eax), %xmm1
+	movdqu	-46(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	mov	$-30, %ebx
+	movdqu	-30(%eax), %xmm1
+	movdqu	-30(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(63bytes):	/* Compare the final 63 bytes; falls through L(47bytes) and L(31bytes).  %eax and %edx point one past the end of each buffer.  */
+	movdqu	-63(%eax), %xmm1
+	movdqu	-63(%edx), %xmm2
+	mov	$-63, %ebx	/* Chunk offset; L(less16bytes) adds %ebx to both pointers.  */
+	pxor	%xmm1, %xmm2	/* Nonzero wherever the two 16-byte chunks differ.  */
+	ptest	%xmm2, %xmm0	/* NOTE(review): relies on %xmm0 being all-zero (set outside this view) so CF is clear iff %xmm2 is nonzero -- confirm.  */
+	jnc	L(less16bytes)
+L(47bytes):
+	mov	$-47, %ebx
+	movdqu	-47(%eax), %xmm1
+	movdqu	-47(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	mov	$-31, %ebx
+	movdqu	-31(%eax), %xmm1
+	movdqu	-31(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	movl	-15(%eax), %ecx	/* Remaining 15 bytes: three dwords, a 16-bit pair, then one byte.  */
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl	/* Lower-addressed byte of the pair first.  */
+	jne	L(end)
+	cmp	%bx, %cx	/* Low bytes are equal here, so the high byte decides.  */
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax	/* Preset the "equal" result; mov leaves the cmp flags intact.  */
+	jne	L(end)
+	RETURN
+# endif
+
+	.p2align 4
+L(64bytes):	/* Compare the final 64 bytes; falls through L(48bytes) and L(32bytes).  %eax and %edx point one past the end of each buffer.  */
+	movdqu	-64(%eax), %xmm1
+	movdqu	-64(%edx), %xmm2
+	mov	$-64, %ebx	/* Chunk offset; L(less16bytes) adds %ebx to both pointers.  */
+	pxor	%xmm1, %xmm2	/* Nonzero wherever the two 16-byte chunks differ.  */
+	ptest	%xmm2, %xmm0	/* NOTE(review): relies on %xmm0 being all-zero (set outside this view) so CF is clear iff %xmm2 is nonzero -- confirm.  */
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%eax), %xmm1
+	movdqu	-48(%edx), %xmm2
+	mov	$-48, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%eax), %xmm1
+	movdqu	-32(%edx), %xmm2
+	mov	$-32, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%eax), %ecx	/* Last 16 bytes as four dwords.  */
+# ifndef USE_AS_WMEMCMP
+	mov	-16(%edx), %ebx	/* memcmp build: L(find_diff) needs both dwords in registers.  */
+	cmp	%ebx, %ecx
+# else
+	cmp	-16(%edx), %ecx	/* wmemcmp build: L(find_diff) only uses the flags.  */
+# endif
+	jne	L(find_diff)
+
+	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax	/* Preset the "equal" result; mov leaves the cmp flags intact.  */
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less16bytes):	/* Reached when the 16-byte chunk at offset %ebx compared unequal: rebase both pointers and scan it dword by dword.  */
+	add	%ebx, %eax	/* %eax/%edx now address the start of the differing chunk.  */
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	mov	(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	mov	4(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	mov	8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	mov	12(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax	/* Preset the "equal" result; mov leaves the cmp flags intact.  */
+	jne	L(find_diff)
+	RETURN
+# else
+	.p2align 4
+L(less16bytes):	/* wmemcmp variant: same scan, but compares against memory because L(find_diff) only uses the flags.  */
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	cmp	4(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	cmp	8(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	cmp	12(%edx), %ecx
+
+	mov	$0, %eax	/* Preset the "equal" result; mov leaves the cmp flags intact.  */
+	jne	L(find_diff)
+	RETURN
+# endif
+
+	.p2align 4
+L(find_diff):	/* Turn a dword mismatch into the return value.  memcmp build: %ecx/%ebx hold the differing dwords of the first/second buffer.  wmemcmp build: only the flags of the caller's cmp are used.  */
+# ifndef USE_AS_WMEMCMP
+	cmpb	%bl, %cl	/* Lowest-addressed byte first.  */
+	jne	L(end)
+	cmp	%bx, %cx	/* Low bytes are equal here, so the second byte decides.  */
+	jne	L(end)
+	shr	$16,%ecx	/* Bring bytes 2 and 3 down and repeat.  */
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):	/* Entered with flags from an unsigned first-vs-second comparison.  */
+	POP	(%ebx)	/* NOTE(review): assumes this file's POP macro (defined outside this view) leaves the flags intact.  */
+	mov	$1, %eax
+	ja	L(bigger)	/* First buffer is greater (unsigned): return 1.  */
+	neg	%eax	/* Otherwise return -1.  */
+L(bigger):
+	ret
+# else
+	POP	(%ebx)	/* NOTE(review): assumes this file's POP macro (defined outside this view) leaves the flags intact.  */
+	mov	$1, %eax
+	jg	L(bigger)	/* Signed comparison: first element greater, return 1.  */
+	neg	%eax	/* Otherwise return -1.  */
+	ret
+
+	.p2align 4
+L(bigger):
+	ret
+# endif
+END (MEMCMP)
+END (MEMCMP)
+
+	.section .rodata.sse4.2,"a",@progbits
+	.p2align 2
+	.type	L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
+L(table_64bytes):	/* 65 entries; entry i targets L(<i>bytes).  Presumably indexed by the remaining byte count (dispatch code and JMPTBL are outside this view).  */
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# else
+L(table_64bytes):	/* wmemcmp variant: same 65 slots, but every index that is not a multiple of 4 targets L(unreal_case).  */
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..5ebf5a4d73
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,2157 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP		__memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push of REG.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* Unwind info for a 4-byte pop of REG.  */
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push plus matching unwind info.  */
+# define POP(REG)	popl REG; CFI_POP (REG)	/* Pop plus matching unwind info.  */
+
+# define PARMS		4	/* Offset from %esp of the first argument at function entry.  */
+# define BLK1		PARMS	/* First buffer pointer.  */
+# define BLK2		BLK1+4	/* Second buffer pointer.  */
+# define LEN		BLK2+4	/* Length argument.  */
+# define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret	/* Restore the three saved registers and return.  */
+# define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state	/* As RETURN_END, then re-establish the remembered CFI state for the code that follows.  */
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	atom_text_section
+ENTRY (MEMCMP)	/* Entry: load the arguments and pick a path by length.  */
+	movl	LEN(%esp), %ecx	/* %ecx = length argument.  */
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx	/* Multiply the element count by 4 to get a byte count.  */
+	test	%ecx, %ecx
+	jz	L(zero)
+# endif
+
+	movl	BLK1(%esp), %eax	/* %eax = first buffer.  */
+	cmp	$48, %ecx
+	movl	BLK2(%esp), %edx	/* %edx = second buffer; mov leaves the cmp flags intact.  */
+	jae	L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$1, %ecx
+	jbe	L(less1bytes)	/* Lengths 0 and 1 are handled without touching %ebx.  */
+# endif
+
+	PUSH	(%ebx)
+	add	%ecx, %edx	/* Both pointers now address one past the end of their buffer.  */
+	add	%ecx, %eax
+	jmp	L(less48bytes)
+
+	CFI_POP	(%ebx)	/* Unwind info only: the code below is reached with %ebx not pushed.  */
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):	/* Length 0 or 1; flags still hold the caller's cmp $1, %ecx.  */
+	jb	L(zero)	/* Length 0: buffers compare equal.  */
+	movb	(%eax), %cl
+	cmp	(%edx), %cl
+	je	L(zero)
+	mov	$1, %eax	/* mov leaves the cmp flags intact.  */
+	ja	L(1bytesend)	/* First byte greater (unsigned): return 1.  */
+	neg	%eax	/* Otherwise return -1.  */
+L(1bytesend):
+	ret
+# endif
+
+	.p2align 4
+L(zero):	/* Return 0; reached only from paths that have pushed nothing.  */
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(48bytesormore):	/* Length is at least 48: check the first 16 bytes unaligned, then align the first pointer and dispatch on the second pointer's misalignment.  */
+	PUSH	(%ebx)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	cfi_remember_state
+	movdqu	(%eax), %xmm3
+	movdqu	(%edx), %xmm0
+	movl	%eax, %edi	/* %edi walks the first buffer, %esi the second.  */
+	movl	%edx, %esi
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%edi), %edi
+
+	sub	$0xffff, %edx	/* Zero only if all 16 bytes matched.  */
+	lea	16(%esi), %esi	/* lea leaves the sub flags intact.  */
+	jnz	L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%edx, %edi	/* Round %edi down to a 16-byte boundary.  */
+	sub	%edx, %esi	/* Step %esi back by the same amount.  */
+	add	%edx, %ecx	/* Grow the count by the same amount.  */
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%edx, %esi	/* Round %esi down too; %edx keeps its misalignment and selects L(shr_N).  */
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)	/* Never taken here: the jz above already handled zero.  */
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	.p2align 2
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)	/* Never taken here: the jz above already handled zero.  */
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)	/* Every other value lands here; the wmemcmp build only provides shifts 0, 4, 8 and 12.  */
+# endif
+
+	.p2align 4
+L(shr_0):	/* Both pointers are 16-byte aligned here, so aligned loads can be compared directly.  */
+	cmp	$80, %ecx
+	jae	L(shr_0_gobble)
+	lea	-48(%ecx), %ecx	/* %ecx = bytes left after the 32 compared below.  */
+	xor	%eax, %eax	/* %eax = 0, the slot where the other L(shr_N) paths keep their misalignment.  */
+	movaps	(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+	movaps	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+	pand	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	add	$32, %edi
+	add	$32, %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer.  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_0_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	lea	-48(%ecx), %ecx
+	movdqa	(%esi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+	movdqa	32(%esi), %xmm0
+	movdqa	48(%esi), %xmm2
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	pcmpeqb	32(%edi), %xmm0
+	pcmpeqb	48(%edi), %xmm2
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	jz	L(shr_0_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %ecx
+	jge	L(shr_0_gobble_loop_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_0_gobble_loop_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm2, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer.  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_1):	/* Second buffer is misaligned by 1: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_1_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 1(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 17(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	1(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 1 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_1_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$1,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$1,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$1,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_1_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_1_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_1_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	1(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 1 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_2):	/* Second buffer is misaligned by 2: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_2_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 2(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 18(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	2(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 2 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_2_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$2,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$2,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$2,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_2_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_2_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_2_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	2(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 2 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_3):	/* Second buffer is misaligned by 3: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_3_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 3(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 19(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	3(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 3 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_3_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$3,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$3,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$3,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_3_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_3_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_3_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	3(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 3 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_4):	/* Second buffer is misaligned by 4: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_4_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 4(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 20(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	4(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 4 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_4_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$4,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$4,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$4,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_4_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_4_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_4_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	4(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 4 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_5):	/* Second buffer is misaligned by 5: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_5_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 5(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 21(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	5(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 5 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_5_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$5,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$5,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$5,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_5_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_5_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_5_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	5(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 5 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_6):	/* Second buffer is misaligned by 6: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_6_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 6(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 22(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	6(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 6 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_6_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$6,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$6,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$6,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_6_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_6_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_6_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	6(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 6 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_7):	/* Second buffer is misaligned by 7: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_7_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$7,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 7(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 23(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	7(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 7 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_7_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$7,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$7,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$7,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_7_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_7_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_7_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	7(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 7 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_8):	/* Second buffer is misaligned by 8: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_8_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$8,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 8(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 24(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	8(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 8 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_8_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$8,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$8,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$8,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_8_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_8_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_8_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	8(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 8 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_9):	/* Second buffer is misaligned by 9: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_9_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$9,(%esi), %xmm1	/* %xmm1 = 16 bytes starting at 9(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 25(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	9(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 9 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_9_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$9,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$9,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$9,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_9_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_9_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_9_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	9(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 9 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_10):	/* Second buffer is misaligned by 10: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_10_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$10, (%esi), %xmm1	/* %xmm1 = 16 bytes starting at 10(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10,%xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 26(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	10(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 10 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_10_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$10, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$10,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$10,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_10_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_10_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_10_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	10(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 10 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_11):	/* Second buffer is misaligned by 11: rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_11_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$11, (%esi), %xmm1	/* %xmm1 = 16 bytes starting at 11(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, %xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 27(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	11(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 11 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_11_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$11, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$11,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$11,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_11_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_11_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_11_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	11(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 11 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_12):	/* Second buffer is misaligned by 12 (in the wmemcmp build this is also the default dispatch target): rebuild its 16-byte chunks with palignr from aligned loads.  */
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx	/* lea keeps the cmp flags for the jae below.  */
+	mov	%edx, %eax	/* %eax = misalignment; presumably consumed by L(exit) (not in view).  */
+	jae	L(shr_12_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$12, (%esi), %xmm1	/* %xmm1 = 16 bytes starting at 12(%esi).  */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, %xmm2, %xmm3	/* %xmm3 = 16 bytes starting at 28(%esi).  */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* Zero only if all 32 bytes matched.  */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	12(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 12 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_12_gobble):	/* Long path: 32 bytes per iteration; each pass tests the previous compare while issuing the next.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$12, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* A borrow (CF) signals the count is exhausted; sbb folds it into %edx.  */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* Keep the first-half result in %xmm1 -- presumably read by L(exit) (not in view).  */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$12,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* Zero only if all 32 bytes matched and there was no borrow.  */
+	movdqa	48(%esi), %xmm0
+	palignr	$12,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_12_gobble_loop)	/* ZF is still from sbb: the SSE ops and lea do not write flags.  */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_12_gobble_next)
+	inc	%edx	/* Cancel the borrow that sbb subtracted.  */
+	add	$32, %ecx	/* Cancel the final over-subtraction.  */
+L(shr_12_gobble_next):
+	test	%edx, %edx	/* Nonzero: the 32 bytes tested in the loop contained a mismatch.  */
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx	/* Test the 32 bytes the loop compared last.  */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* %eax = one past the end of the first buffer.  */
+	lea	12(%ecx, %esi,1), %edx	/* %edx = one past the end of the second buffer (the 12 restores the shift).  */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_13):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_13_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$13, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_13_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$13, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$13,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$13,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_13_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_13_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_14):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_14_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$14, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_14_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$14, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$14,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$14,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_14_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_14_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_15):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_15_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$15, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_15_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$15, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$15,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$15,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_15_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_15_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(exit):
+	pmovmskb %xmm1, %ebx
+	sub	$0xffff, %ebx
+	jz	L(first16bytes)
+	lea	-16(%esi), %esi
+	lea	-16(%edi), %edi
+	mov	%ebx, %edx
+
+L(first16bytes):
+	add	%eax, %esi
+L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
+	test	%dl, %dl
+	jz	L(next_24_bytes)
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+L(Byte23):
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte16):
+	movzbl	-16(%edi), %eax
+	movzbl	-16(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte17):
+	movzbl	-15(%edi), %eax
+	movzbl	-15(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte18):
+	movzbl	-14(%edi), %eax
+	movzbl	-14(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte19):
+	movzbl	-13(%edi), %eax
+	movzbl	-13(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte20):
+	movzbl	-12(%edi), %eax
+	movzbl	-12(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte21):
+	movzbl	-11(%edi), %eax
+	movzbl	-11(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte22):
+	movzbl	-10(%edi), %eax
+	movzbl	-10(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(next_24_bytes):
+	lea	8(%edi), %edi
+	lea	8(%esi), %esi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	.p2align 4
+L(Byte31):
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
+	sub	%edx, %eax
+	RETURN_END
+# else
+
+/* special for wmemcmp */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	-16(%edi), %eax
+	cmp	-16(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	-12(%edi), %eax
+	cmp	-12(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%edi), %eax
+	cmp	-8(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	-4(%edi), %eax
+	cmp	-4(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(nequal_bigger):
+	RETURN_END
+# endif
+
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(more8bytes):
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)
+# else
+	jmp	L(12bytes)
+# endif
+
+	.p2align 4
+L(more16bytes):
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)
+# else
+	jmp	L(20bytes)
+# endif
+
+	.p2align 4
+L(more24bytes):
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)
+# else
+	jmp	L(28bytes)
+# endif
+
+	.p2align 4
+L(more32bytes):
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)
+# else
+	jmp	L(36bytes)
+# endif
+
+	.p2align 4
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+# else
+	jmp	L(4bytes)
+# endif
+
+	.p2align 4
+L(more40bytes):
+	cmp	$40, %ecx
+	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)
+
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	mov	-44(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	mov	-40(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	mov	-36(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	mov	-32(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	mov	-28(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	mov	-24(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	mov	-20(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# else
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	cmp	-44(%edx), %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	cmp	-40(%edx), %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	cmp	-36(%edx), %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	cmp	-32(%edx), %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	cmp	-28(%edx), %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	cmp	-24(%edx), %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	cmp	-20(%edx), %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	xor	%eax, %eax
+	cmp	-4(%edx), %ecx
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# endif
+
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
+L(45bytes):
+	mov	-45(%eax), %ecx
+	mov	-45(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(41bytes):
+	mov	-41(%eax), %ecx
+	mov	-41(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(37bytes):
+	mov	-37(%eax), %ecx
+	mov	-37(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(33bytes):
+	mov	-33(%eax), %ecx
+	mov	-33(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(29bytes):
+	mov	-29(%eax), %ecx
+	mov	-29(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(25bytes):
+	mov	-25(%eax), %ecx
+	mov	-25(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(21bytes):
+	mov	-21(%eax), %ecx
+	mov	-21(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(46bytes):
+	mov	-46(%eax), %ecx
+	mov	-46(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(42bytes):
+	mov	-42(%eax), %ecx
+	mov	-42(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(38bytes):
+	mov	-38(%eax), %ecx
+	mov	-38(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(34bytes):
+	mov	-34(%eax), %ecx
+	mov	-34(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(30bytes):
+	mov	-30(%eax), %ecx
+	mov	-30(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(26bytes):
+	mov	-26(%eax), %ecx
+	mov	-26(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(22bytes):
+	mov	-22(%eax), %ecx
+	mov	-22(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(47bytes):
+	movl	-47(%eax), %ecx
+	movl	-47(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%eax), %ecx
+	movl	-43(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%eax), %ecx
+	movl	-39(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%eax), %ecx
+	movl	-35(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%eax), %ecx
+	movl	-31(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%eax), %ecx
+	movl	-27(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%eax), %ecx
+	movl	-23(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(find_diff):
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+
+	.p2align 4
+L(end):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+# else
+
+/* for wmemcmp */
+	.p2align 4
+L(find_diff):
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+# endif
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
new file mode 100644
index 0000000000..1fc5994a17
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
@@ -0,0 +1,62 @@
+/* Multiple versions of memcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(memcmp)	/* IFUNC resolver: returns in EAX the memcmp variant to use.  */
+	.type	memcmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcmp_ia32)	/* Default choice.  */
+	HAS_CPU_FEATURE (SSSE3)	/* Presumably sets ZF when the feature is absent.  */
+	jz	2f	/* No SSSE3: keep the ia32 version.  */
+	LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)	/* Only tested once SSSE3 was found.  */
+	jz	2f	/* No SSE4.2: keep the SSSE3 version.  */
+	LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)
+2:	ret
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_ia32, @function; \
+	.p2align 4; \
+	.globl __memcmp_ia32; \
+	.hidden __memcmp_ia32; \
+	__memcmp_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..2fe2072cb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,681 @@
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY	__memcpy_sse2_unaligned
+#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
+# endif
+
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN	RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)	/* Checked entry: no ret here, so a passing check runs on into the code that follows.  */
+	movl	12(%esp), %eax	/* Third stack argument (presumably the copy length).  */
+	cmpl	%eax, 16(%esp)	/* Fourth stack argument (presumably the destination object size).  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Unsigned: fourth argument below third.  */
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx	/* ecx = byte count.  */
+	movl	SRC(%esp), %eax	/* eax = source.  */
+	movl	DEST(%esp), %edx	/* edx = destination.  */
+	cmp	%edx, %eax	/* Flags = src - dst.  */
+
+# ifdef USE_AS_MEMMOVE
+	ja	L(check_forward)	/* Addresses are unsigned, so use ja (not jg): src above dst.  */
+
+L(mm_len_0_or_more_backward):
+/* Now do checks for lengths. We do [0..16], (16..32], (32..64], (64..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmpl	$32, %ecx
+	ja	L(mm_len_32_or_more_backward)	/* Length is unsigned: ja, not jg.  */
+
+/* Copy (16..32] bytes and return.  */
+	movdqu	(%eax), %xmm0	/* First 16 bytes.  */
+	movdqu	-16(%eax, %ecx), %xmm1	/* Last 16 bytes; may overlap the first 16.  */
+	movdqu	%xmm0, (%edx)	/* Both loads are done before either store.  */
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_backward):
+	cmpl	$64, %ecx
+	ja	L(mm_len_64_or_more_backward)	/* Length is unsigned: ja, not jg.  */
+
+/* Copy (32..64] bytes and return.  */
+	movdqu	(%eax), %xmm0	/* First 32 bytes...  */
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2	/* ...and last 32 bytes, all loaded...  */
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)	/* ...before the first store.  */
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_backward):
+	cmpl	$128, %ecx
+	ja	L(mm_len_128_or_more_backward)	/* Length is unsigned: ja, not jg.  */
+
+/* Copy (64..128] bytes and return.  */
+	movdqu	(%eax), %xmm0	/* First 64 bytes.  */
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4	/* Last 64 bytes; may overlap the first 64.  */
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)	/* All eight loads are done before the first store.  */
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_128_or_more_backward):
+	add	%ecx, %eax	/* eax = src + len.  */
+	cmp	%edx, %eax	/* Compare the end of the source with dst.  */
+	movl	SRC(%esp), %eax	/* Reload src; movl leaves the flags alone.  */
+	jbe	L(forward)	/* Unsigned (jbe, not jle): source ends at or below dst.  */
+	PUSH (%esi)
+	PUSH (%edi)
+	PUSH (%ebx)	/* Matched by the POP (%ebx) after the copy loops.  */
+
+/* Aligning the address of destination. */
+	movdqu	(%eax), %xmm4	/* Keep the first 64 source bytes in xmm4..xmm7.  */
+	movdqu	16(%eax), %xmm5
+	movdqu	32(%eax), %xmm6
+	movdqu	48(%eax), %xmm7
+	leal	(%edx, %ecx), %esi	/* esi = dst + len.  */
+	movdqu	-16(%eax, %ecx), %xmm0	/* Last 16 source bytes...  */
+	subl	$16, %esp
+	movdqu	%xmm0, (%esp)	/* ...are parked on the stack.  */
+	mov	%ecx, %edi	/* edi = len.  */
+	movl	%esi, %ecx
+	andl	$-16, %ecx	/* ecx = end of dst rounded down to 16 bytes.  */
+	leal	(%ecx), %ebx
+	subl	%edx, %ebx	/* ebx = aligned end - dst.  */
+	leal	(%eax, %ebx), %eax	/* eax = source address matching ecx.  */
+	shrl	$6, %ebx	/* ebx = number of 64-byte blocks to copy.  */
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %edi
+# else
+#  ifdef SHARED
+	PUSH (%ebx)	/* ebx is borrowed as the PIC register.  */
+	SETUP_PIC_REG (bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+	POP (%ebx)	/* popl does not change the flags set by cmp.  */
+#  else
+	cmp	__x86_shared_cache_size_half, %edi
+#  endif
+# endif
+	jae	L(mm_large_page_loop_backward)	/* len >= threshold: non-temporal loop.  */
+
+	.p2align 4
+L(mm_main_loop_backward):	/* Copies 64 bytes per pass, from high addresses to low.  */
+
+	prefetcht0 -128(%eax)
+
+	movdqu	-64(%eax), %xmm0	/* Unaligned loads from the source.  */
+	movdqu	-48(%eax), %xmm1
+	movdqu	-32(%eax), %xmm2
+	movdqu	-16(%eax), %xmm3
+	movaps	%xmm0, -64(%ecx)	/* Aligned stores: ecx was rounded to 16 bytes.  */
+	subl	$64, %eax
+	movaps	%xmm1, -48(%ecx)
+	movaps	%xmm2, -32(%ecx)
+	movaps	%xmm3, -16(%ecx)
+	subl	$64, %ecx
+	sub	$1, %ebx	/* ebx = remaining 64-byte blocks.  */
+	jnz	L(mm_main_loop_backward)
+	movdqu	(%esp), %xmm0	/* Reload the parked last 16 source bytes.  */
+	addl	$16, %esp
+	movdqu	%xmm0, -16(%esi)	/* esi = dst + len.  */
+	movdqu	%xmm4, (%edx)	/* First 64 bytes, loaded before the loop.  */
+	movdqu	%xmm5, 16(%edx)
+	movdqu	%xmm6, 32(%edx)
+	movdqu	%xmm7, 48(%edx)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+/* Copy [0..16] bytes and return; dispatches on the bits of the length.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %cl	/* Bit 3 or 4 set: length is 8..16.  */
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %cl	/* Bit 2 set: length is 4..7.  */
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	testl	%ecx, %ecx	/* Nothing to copy.  */
+	.p2align 4,,2
+	je	L(return)
+	testb	$2, %cl	/* Bit 1 set: length is 2..3.  */
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%eax,%ecx), %ebx	/* Length 1: last and first byte coincide.  */
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+L(mm_len_3_4_bytes_backward):	/* Reached with length 2..3.  */
+	movzwl	-2(%eax,%ecx), %ebx	/* Last two bytes.  */
+	movzwl	(%eax), %eax	/* First two bytes; both loads precede the stores.  */
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+L(mm_len_9_16_bytes_backward):	/* Reached with length 8..16.  */
+	PUSH (%esi)
+	movl	-4(%eax,%ecx), %ebx	/* Load the last 8 bytes...  */
+	movl	-8(%eax,%ecx), %esi
+	movl	%ebx, -4(%edx,%ecx)	/* ...then store them.  */
+	movl	%esi, -8(%edx,%ecx)
+	subl	$8, %ecx	/* 8 bytes done; the rest is 0..8 bytes.  */
+	POP (%esi)
+	jmp	L(mm_len_0_16_bytes_backward)	/* Dispatch again on the remainder.  */
+
+L(mm_len_5_8_bytes_backward):	/* Reached with length 4..7.  */
+	movl	(%eax), %ebx	/* First 4 bytes.  */
+	movl	-4(%eax,%ecx), %eax	/* Last 4 bytes; may overlap the first 4.  */
+	movl	%ebx, (%edx)	/* Both loads precede both stores.  */
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+/* Big length copy backward part: same loop shape, non-temporal stores.  */
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%eax), %xmm0	/* Unaligned loads from the source.  */
+	movdqu	-48(%eax), %xmm1
+	movdqu	-32(%eax), %xmm2
+	movdqu	-16(%eax), %xmm3
+	movntdq	%xmm0, -64(%ecx)	/* ecx was rounded down to 16 bytes.  */
+	subl	$64, %eax
+	movntdq	%xmm1, -48(%ecx)
+	movntdq	%xmm2, -32(%ecx)
+	movntdq	%xmm3, -16(%ecx)
+	subl	$64, %ecx
+	sub	$1, %ebx	/* ebx = remaining 64-byte blocks.  */
+	jnz	L(mm_large_page_loop_backward)
+	sfence	/* Fence after the movntdq stores.  */
+	movdqu	(%esp), %xmm0	/* Reload the parked last 16 source bytes.  */
+	addl	$16, %esp
+	movdqu	%xmm0, -16(%esi)	/* esi = dst + len.  */
+	movdqu	%xmm4, (%edx)	/* First 64 bytes, loaded before the loop.  */
+	movdqu	%xmm5, 16(%edx)
+	movdqu	%xmm6, 32(%edx)
+	movdqu	%xmm7, 48(%edx)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+L(check_forward):
+	add	%edx, %ecx	/* ecx = dst + len.  */
+	cmp	%eax, %ecx	/* Compare the end of the destination with src.  */
+	movl	LEN(%esp), %ecx	/* Reload len; movl leaves the flags alone.  */
+	jbe	L(forward)	/* Unsigned (jbe, not jle): destination ends at or below src.  */
+
+/* Now do checks for lengths. We do [0..16], (16..32], (32..64], (64..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmpl	$32, %ecx
+	ja	L(mm_len_32_or_more_forward)
+
+/* Copy (16..32] bytes and return.  */
+	movdqu	(%eax), %xmm0	/* First 16 bytes.  */
+	movdqu	-16(%eax, %ecx), %xmm1	/* Last 16 bytes; may overlap the first 16.  */
+	movdqu	%xmm0, (%edx)	/* Both loads are done before either store.  */
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_forward):
+	cmpl	$64, %ecx
+	ja	L(mm_len_64_or_more_forward)
+
+/* Copy (32..64] bytes and return.  */
+	movdqu	(%eax), %xmm0	/* First 32 bytes...  */
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2	/* ...and last 32 bytes, all loaded...  */
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)	/* ...before the first store.  */
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_forward):
+	cmpl	$128, %ecx
+	ja	L(mm_len_128_or_more_forward)
+
+/* Copy (64..128] bytes and return.  */
+	movdqu	(%eax), %xmm0	/* First 64 bytes.  */
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4	/* Last 64 bytes; may overlap the first 64.  */
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)	/* All eight loads are done before the first store.  */
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_128_or_more_forward):	/* Reached with len above 128.  */
+	PUSH (%esi)
+	PUSH (%edi)
+	PUSH (%ebx)	/* Matched by the POP (%ebx) after the copy loops.  */
+
+/* Aligning the address of destination. */
+	movdqu	-16(%eax, %ecx), %xmm4	/* Keep the last 64 source bytes in xmm4..xmm7.  */
+	movdqu	-32(%eax, %ecx), %xmm5
+	movdqu	-48(%eax, %ecx), %xmm6
+	movdqu	-64(%eax, %ecx), %xmm7
+	leal	(%edx, %ecx), %esi	/* esi = dst + len.  */
+	movdqu	(%eax), %xmm0	/* First 16 source bytes...  */
+	subl	$16, %esp
+	movdqu	%xmm0, (%esp)	/* ...are parked on the stack.  */
+	mov	%ecx, %edi	/* edi = len.  */
+	leal	16(%edx), %ecx
+	andl	$-16, %ecx	/* ecx = first 16-byte boundary above dst.  */
+	movl	%ecx, %ebx
+	subl	%edx, %ebx	/* ebx = bytes skipped to reach that boundary.  */
+	addl	%ebx, %eax	/* eax = source address matching ecx.  */
+	movl	%esi, %ebx
+	subl	%ecx, %ebx	/* ebx = bytes from the boundary to the end.  */
+	shrl	$6, %ebx	/* ebx = number of 64-byte blocks to copy.  */
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %edi
+# else
+#  ifdef SHARED
+	PUSH (%ebx)	/* ebx is borrowed as the PIC register.  */
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+	POP (%ebx)	/* popl does not change the flags set by cmp.  */
+#  else
+	cmp	__x86_shared_cache_size_half, %edi
+#  endif
+# endif
+	jae	L(mm_large_page_loop_forward)	/* len >= threshold: non-temporal loop.  */
+
+	.p2align 4
+L(mm_main_loop_forward):	/* Copies 64 bytes per pass, from low addresses to high.  */
+
+	prefetcht0 128(%eax)
+
+	movdqu	(%eax), %xmm0	/* Unaligned loads from the source.  */
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqa	%xmm0, (%ecx)	/* Aligned stores: ecx was rounded to 16 bytes.  */
+	addl	$64, %eax
+	movaps	%xmm1, 16(%ecx)
+	movaps	%xmm2, 32(%ecx)
+	movaps	%xmm3, 48(%ecx)
+	addl	$64, %ecx
+	sub	$1, %ebx	/* ebx = remaining 64-byte blocks.  */
+	jnz	L(mm_main_loop_forward)
+	movdqu	(%esp), %xmm0	/* Reload the parked first 16 source bytes.  */
+	addl	$16, %esp
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm4, -16(%esi)	/* Last 64 bytes, loaded before the loop; esi = dst + len.  */
+	movdqu	%xmm5, -32(%esi)
+	movdqu	%xmm6, -48(%esi)
+	movdqu	%xmm7, -64(%esi)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_len_0_16_bytes_forward):	/* Length [0..16]: dispatch on the bits of the length.  */
+	testb	$24, %cl	/* Bit 3 or 4 set: length is 8..16.  */
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %cl	/* Bit 2 set: length is 4..7.  */
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	testl	%ecx, %ecx	/* Nothing to copy.  */
+	.p2align 4,,2
+	je	L(return)
+	testb	$2, %cl	/* Bit 1 set: length is 2..3.  */
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%eax,%ecx), %ebx	/* Length 1: last and first byte coincide.  */
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+L(mm_len_2_4_bytes_forward):	/* Reached with length 2..3.  */
+	movzwl	-2(%eax,%ecx), %ebx	/* Last two bytes.  */
+	movzwl	(%eax), %eax	/* First two bytes; both loads precede the stores.  */
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+L(mm_len_5_8_bytes_forward):	/* Reached with length 4..7.  */
+	movl	(%eax), %ebx	/* First 4 bytes.  */
+	movl	-4(%eax,%ecx), %eax	/* Last 4 bytes; may overlap the first 4.  */
+	movl	%ebx, (%edx)	/* Both loads precede both stores.  */
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+L(mm_len_9_16_bytes_forward):	/* Reached with length 8..16.  */
+	movq	(%eax), %xmm0	/* First 8 bytes.  */
+	movq	-8(%eax, %ecx), %xmm1	/* Last 8 bytes; may overlap the first 8.  */
+	movq	%xmm0, (%edx)	/* Both loads precede both stores.  */
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_return_pop_all):	/* Exit for the paths that pushed esi and edi.  */
+	movl	%edx, %eax	/* Return value = destination.  */
+	POP (%edi)
+	POP (%esi)
+	RETURN
+
+/* Big length copy forward part: same loop shape, non-temporal stores.  */
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%eax), %xmm0	/* Unaligned loads from the source.  */
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movntdq	%xmm0, (%ecx)	/* ecx was rounded to a 16-byte boundary.  */
+	addl	$64, %eax
+	movntdq	%xmm1, 16(%ecx)
+	movntdq	%xmm2, 32(%ecx)
+	movntdq	%xmm3, 48(%ecx)
+	addl	$64, %ecx
+	sub	$1, %ebx	/* ebx = remaining 64-byte blocks.  */
+	jnz	L(mm_large_page_loop_forward)
+	sfence	/* Fence after the movntdq stores.  */
+	movdqu	(%esp), %xmm0	/* Reload the parked first 16 source bytes.  */
+	addl	$16, %esp
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm4, -16(%esi)	/* Last 64 bytes, loaded before the loop; esi = dst + len.  */
+	movdqu	%xmm5, -32(%esi)
+	movdqu	%xmm6, -48(%esi)
+	movdqu	%xmm7, -64(%esi)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+# endif
+
+L(forward):	/* Plain forward copy: eax = src, edx = dst, ecx = len.  */
+	cmp	$16, %ecx
+	jbe	L(len_0_16_bytes)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+	jae     L(large_page)	/* len >= threshold: non-temporal path.  */
+
+	movdqu	(%eax), %xmm0	/* First and last 16 bytes.  */
+	movdqu	-16(%eax, %ecx), %xmm1
+	cmpl    $32, %ecx	/* The SSE stores below leave these flags intact.  */
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jbe	L(return)	/* len <= 32: everything is copied.  */
+
+	movdqu	16(%eax), %xmm0	/* Next 16 bytes from each end.  */
+	movdqu	-32(%eax, %ecx), %xmm1
+	cmpl    $64, %ecx
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	jbe	L(return)	/* len <= 64: everything is copied.  */
+
+	movdqu	32(%eax), %xmm0	/* Next 32 bytes from each end.  */
+	movdqu	48(%eax), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+	cmpl    $128, %ecx
+	movdqu	%xmm0, 32(%edx)
+	movdqu	%xmm1, 48(%edx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	jbe	L(return)	/* len <= 128: everything is copied.  */
+
+/* Now the main loop: we align the address of the destination.  */
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx	/* ebx = first 64-byte boundary above dst.  */
+
+	addl	%edx, %ecx
+	andl	$-64, %ecx	/* ecx = end of dst rounded down to 64 bytes.  */
+
+	subl	%edx, %eax	/* eax = src - dst, so (%ebx, %eax) is the source.  */
+
+/* We should stop two iterations before the termination
+	(in order not to misprefetch).  */
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_just_one_iteration)	/* Exactly one 64-byte block left.  */
+
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_last_two_iterations)	/* Exactly two 64-byte blocks left.  */
+
+	.p2align 4
+L(main_loop_cache):	/* 64 bytes per pass; ebx = dst cursor, eax = src - dst.  */
+
+	prefetcht0 128(%ebx, %eax)
+
+	movdqu	(%ebx, %eax), %xmm0	/* Unaligned loads from the source.  */
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)	/* Aligned stores: ebx is 64-byte aligned.  */
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	lea	64(%ebx), %ebx
+	cmpl	%ebx, %ecx	/* ecx marks where the final two blocks start.  */
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):	/* Final 128 bytes, with no prefetch.  */
+	movdqu	(%ebx, %eax), %xmm0	/* Eight unaligned source loads...  */
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movdqa	%xmm0, (%ebx)	/* ...then eight aligned stores.  */
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	movaps	%xmm4, 64(%ebx)
+	movaps	%xmm5, 80(%ebx)
+	movaps	%xmm6, 96(%ebx)
+	movaps	%xmm7, 112(%ebx)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):	/* Single 64-byte block at ebx.  */
+	movdqu	(%ebx, %eax), %xmm0	/* Four unaligned source loads...  */
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)	/* ...then four aligned stores.  */
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	jmp	L(return)
+
+L(large_page):	/* len >= shared-cache threshold: copy 128 bytes at each end first.  */
+	movdqu	(%eax), %xmm0	/* Bytes [0, 64) from the start...  */
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4	/* ...and the last 64 bytes.  */
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+
+	movdqu	64(%eax), %xmm0	/* Bytes [64, 128) from the start...  */
+	movdqu	80(%eax), %xmm1
+	movdqu	96(%eax), %xmm2
+	movdqu	112(%eax), %xmm3
+	movdqu	-128(%eax, %ecx), %xmm4	/* ...and bytes [len-128, len-64).  */
+	movdqu	-112(%eax, %ecx), %xmm5
+	movdqu	-96(%eax, %ecx), %xmm6
+	movdqu	-80(%eax, %ecx), %xmm7
+	movdqu	%xmm0, 64(%edx)
+	movdqu	%xmm1, 80(%edx)
+	movdqu	%xmm2, 96(%edx)
+	movdqu	%xmm3, 112(%edx)
+	movdqu	%xmm4, -128(%edx, %ecx)
+	movdqu	%xmm5, -112(%edx, %ecx)
+	movdqu	%xmm6, -96(%edx, %ecx)
+	movdqu	%xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non temporal stores. We align
+	the address of the destination.  */
+	leal	128(%edx), %ebx
+	andl	$-128, %ebx	/* ebx = first 128-byte boundary above dst.  */
+
+	addl	%edx, %ecx
+	andl	$-128, %ecx	/* ecx = end of dst rounded down to 128 bytes.  */
+
+	subl	%edx, %eax	/* eax = src - dst, so (%ebx, %eax) is the source.  */
+
+	.p2align 4
+L(main_loop_large_page):	/* 128 bytes per pass.  NOTE(review): body runs before the test, so ecx presumably starts above ebx - confirm.  */
+	movdqu	(%ebx, %eax), %xmm0	/* Eight unaligned source loads.  */
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movntdq	%xmm0, (%ebx)	/* Non-temporal stores to the 128-byte aligned cursor.  */
+	movntdq	%xmm1, 16(%ebx)
+	movntdq	%xmm2, 32(%ebx)
+	movntdq	%xmm3, 48(%ebx)
+	movntdq	%xmm4, 64(%ebx)
+	movntdq	%xmm5, 80(%ebx)
+	movntdq	%xmm6, 96(%ebx)
+	movntdq	%xmm7, 112(%ebx)
+	lea	128(%ebx), %ebx
+	cmpl	%ebx, %ecx	/* Stop when the cursor reaches the aligned end.  */
+	jne	L(main_loop_large_page)
+	sfence	/* Fence after the movntdq stores.  */
+	jmp	L(return)
+
+L(len_0_16_bytes):	/* Length [0..16]: dispatch on the bits of the length.  */
+	testb	$24, %cl	/* Bit 3 or 4 set: length is 8..16.  */
+	jne	L(len_9_16_bytes)
+	testb	$4, %cl	/* Bit 2 set: length is 4..7.  */
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	testl	%ecx, %ecx	/* Nothing to copy.  */
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%eax), %ebx	/* Length 1..3: copy the first byte...  */
+	testb	$2, %cl	/* movb below does not change these flags.  */
+	movb	%bl, (%edx)
+	je	L(return)	/* Length 1: done.  */
+	movzwl	-2(%eax,%ecx), %ebx	/* ...then the last two bytes.  */
+	movw	%bx, -2(%edx,%ecx)
+	jmp	L(return)
+
+L(len_9_16_bytes):	/* Reached with length 8..16.  */
+	movq	(%eax), %xmm0	/* First 8 bytes.  */
+	movq	-8(%eax, %ecx), %xmm1	/* Last 8 bytes; may overlap the first 8.  */
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(len_5_8_bytes):	/* Reached with length 4..7; runs on into L(return).  */
+	movl	(%eax), %ebx	/* First 4 bytes.  */
+	movl	%ebx, (%edx)
+	movl	-4(%eax,%ecx), %ebx	/* Last 4 bytes; may overlap the first 4.  */
+	movl	%ebx, -4(%edx,%ecx)
+
+L(return):	/* Common exit: edx still holds the destination.  */
+	movl	%edx, %eax	/* Return value = destination.  */
+# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx	/* Reload the byte count from the stack.  */
+	add	%ecx, %eax	/* Return destination + length instead.  */
+# endif
+	RETURN
+
+END (MEMCPY)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
new file mode 100644
index 0000000000..687e083147
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -0,0 +1,1809 @@
+/* memcpy with SSSE3 and REP string.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+/* Default symbol names for the two entry points.  MEMCPY may already be
+   defined when this point is reached, in which case both defaults are
+   skipped (presumably a wrapper file then supplies its own names --
+   confirm against the files that include this one).  */
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_rep
+# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
+#endif
+/* Stack offsets of the three arguments; they are used in the form
+   SRC(%esp), DEST(%esp) and LEN(%esp).  PARMS, the offset of the first
+   argument, is defined further down.  With USE_AS_BCOPY the source
+   pointer is the first argument and the destination the second;
+   otherwise the destination comes first.  LEN is the third argument in
+   both layouts.  */
+#ifdef USE_AS_BCOPY
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+/* Unwind annotation for a 4-byte push of REG: the CFA offset grows by 4
+   and REG is recorded as saved at offset 0 relative to the current CFA
+   register (presumably the stack pointer in this code).  */
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+/* Unwind annotation for the matching pop: the CFA offset shrinks by 4
+   and REG is marked as restored.  */
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+/* pushl/popl of REG immediately followed by the annotation above, so the
+   unwind information stays in step with the stack pointer.  */
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+/* Build-mode dependent pieces.  In a SHARED build ENTRANCE pushes EBX,
+   which the jump-table macros below use as a scratch register; because
+   of that push the first argument sits at offset 8 from ESP instead
+   of 4.  RETURN_END pops EBX again and returns.  RETURN does the same
+   and then re-emits the unwind annotation for the pushed EBX, which
+   only affects the unwind description of whatever is assembled after
+   the return; RETURN_END is the variant without that trailing
+   annotation.  JMPTBL (I, B) is the value stored in a jump table for
+   target I of the table at B: the difference I - B here, the plain
+   address I in the non-SHARED case.  */
+#ifdef SHARED
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is the register that contains
+   the index into the jump table.  SCALE is the scale of INDEX.  The
+   previous contents of EBX are lost.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+/* First half of a split table branch: turn EBX into the address of
+   TABLE.  The immediate is TABLE minus the address of this instruction,
+   so EBX has to hold the address of this very instruction on entry.  In
+   this file that holds because the macro is the first thing after a
+   label that was reached through the `jmp *%ebx' of
+   BRANCH_TO_JMPTBL_ENTRY.  */
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+/* Second half of a split table branch: EBX already holds the address of
+   TABLE (see BRANCH_TO_JMPTBL_ENTRY_VALUE); add the selected relative
+   entry and jump to the result.  */
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+#else
+# define PARMS		4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute addresses.  INDEX is the register that contains the index
+   into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+/* Expands to nothing: with absolute table entries no register has to be
+   prepared ahead of the branch.  */
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+/* The same indirect jump as BRANCH_TO_JMPTBL_ENTRY.  */
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+/* Checked entry point (not assembled when USE_AS_BCOPY is defined).  It
+   compares two stack slots and jumps to __chk_fail when the value at
+   16(%esp) is below the value at 12(%esp), unsigned.  12(%esp) is the
+   argument slot that the main entry point reads as LEN (before its own
+   register push, if any); 16(%esp) is presumably the size of the
+   destination object -- confirm against the callers.  When the check
+   passes, control runs on past END (MEMCPY_CHK) into the code that
+   follows it, assuming END emits no instructions.  */
+	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax		/* EAX = value of the LEN slot.  */
+	cmpl	%eax, 16(%esp)		/* Flags from 16(%esp) - EAX.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Taken when 16(%esp) < EAX.  */
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+/* From here on: ECX = length, EAX = source, EDX = destination.  */
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)		/* dest below src.  */
+	je	L(fwd_write_0bytes)	/* dest == src: only the return path runs.  */
+	cmp	$48, %ecx
+	jb	L(bk_write_less48bytes)
+	add	%ecx, %eax		/* EAX = src + len, for the overlap test.  */
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax		/* Reload src; MOV leaves the flags alone.  */
+	jb	L(copy_backward)	/* dest below src + len.  */
+/* Otherwise dest >= src + len and the forward path below is used.  */
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+/* Reached by fall-through with ECX < 48: the whole copy is done by one
+   of the jump-table handlers.  */
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al		/* Compares only the low address bytes.  */
+	jb	L(bk_write)		/* Low byte of src below low byte of dest.  */
+#endif
+	add	%ecx, %edx		/* Move both pointers to the end of the data;  */
+	add	%ecx, %eax		/* fwd_write_N uses negative offsets from there.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):				/* Pointers are left at the start here.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* ECX >= 48 (checked by the cmp $48 above).  EDX may have any alignment
+   here; it is advanced to the next 16-byte boundary below.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	cfi_remember_state
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+	mov	%eax, %edi
+	jae	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+L(shl_0_gobble):
+
+/* Choose the loop for a copy of more than 127 bytes whose source and
+   destination are both 16-byte aligned.  EDI receives half the data
+   cache size and is then reduced by one eighth of itself.  If the
+   remaining length in ECX is at least that threshold, the code at
+   L(shl_0_gobble_mem_start) is used; otherwise execution falls through
+   to the code that follows.  ESI is used as scratch; EBX is clobbered
+   in the SHARED run-time lookup.  */
+#ifdef DATA_CACHE_SIZE_HALF
+	/* Load the compile-time constant into EDI exactly like the run-time
+	   value below.  A CMP against ECX at this point would be useless:
+	   its flags are overwritten by the arithmetic that follows, and EDI
+	   would still hold the zero left by L(shl_0), making the threshold
+	   zero and the branch below unconditional.  */
+	mov	$DATA_CACHE_SIZE_HALF, %edi
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+	mov	__x86_data_cache_size_half, %edi
+# endif
+#endif
+	mov	%edi, %esi
+	shr	$3, %esi		/* ESI = size / 8.  */
+	sub	%esi, %edi		/* EDI = size - size / 8.  */
+	cmp	%edi, %ecx
+	jae	L(shl_0_gobble_mem_start)	/* Length >= threshold.  */
+	sub	$128, %ecx
+	ALIGN (4)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_0_gobble_mem_start):
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+	sub	$128, %ecx
+L(shl_0_gobble_mem_loop):
+	prefetchnta 0x1c0(%eax)
+	prefetchnta 0x280(%eax)
+	prefetchnta 0x1c0(%edx)
+	prefetchnta 0x280(%edx)
+
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$1, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_1_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_1_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_1_loop)
+
+L(shl_1_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$2, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_2_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_2_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_2_loop)
+
+L(shl_2_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$3, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_3_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_3_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_3_loop)
+
+L(shl_3_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$4, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_4_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_4_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_4_loop)
+
+L(shl_4_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_5):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$5, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_5_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_5_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_5_loop)
+
+L(shl_5_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_6):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$6, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_6_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_6_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_6_loop)
+
+L(shl_6_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_7):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$7, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_7_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_7_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_7_loop)
+
+L(shl_7_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_8):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$8, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_8_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_8_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_8_loop)
+
+L(shl_8_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	8(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_9):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$9, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_9_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_9_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_9_loop)
+
+L(shl_9_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	9(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_10):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$10, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_10_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_10_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_10_loop)
+
+L(shl_10_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	10(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_11):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$11, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_11_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_11_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_11_loop)
+
+L(shl_11_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	11(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_12):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$12, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_12_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_12_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_12_loop)
+
+L(shl_12_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	12(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_13):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$13, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_13_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_13_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_13_loop)
+
+L(shl_13_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	13(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_14):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$14, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_14_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_14_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_14_loop)
+
+L(shl_14_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	14(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_15):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$15, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_15_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_15_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_15_loop)
+
+L(shl_15_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	15(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(fwd_write_44bytes):
+	movl	-44(%eax), %ecx
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes):
+	movl	-45(%eax), %ecx
+	movl	%ecx, -45(%edx)
+L(fwd_write_41bytes):
+	movl	-41(%eax), %ecx
+	movl	%ecx, -41(%edx)
+L(fwd_write_37bytes):
+	movl	-37(%eax), %ecx
+	movl	%ecx, -37(%edx)
+L(fwd_write_33bytes):
+	movl	-33(%eax), %ecx
+	movl	%ecx, -33(%edx)
+L(fwd_write_29bytes):
+	movl	-29(%eax), %ecx
+	movl	%ecx, -29(%edx)
+L(fwd_write_25bytes):
+	movl	-25(%eax), %ecx
+	movl	%ecx, -25(%edx)
+L(fwd_write_21bytes):
+	movl	-21(%eax), %ecx
+	movl	%ecx, -21(%edx)
+L(fwd_write_17bytes):
+	movl	-17(%eax), %ecx
+	movl	%ecx, -17(%edx)
+L(fwd_write_13bytes):
+	movl	-13(%eax), %ecx
+	movl	%ecx, -13(%edx)
+L(fwd_write_9bytes):
+	movl	-9(%eax), %ecx
+	movl	%ecx, -9(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes):
+	movl	-46(%eax), %ecx
+	movl	%ecx, -46(%edx)
+L(fwd_write_42bytes):
+	movl	-42(%eax), %ecx
+	movl	%ecx, -42(%edx)
+L(fwd_write_38bytes):
+	movl	-38(%eax), %ecx
+	movl	%ecx, -38(%edx)
+L(fwd_write_34bytes):
+	movl	-34(%eax), %ecx
+	movl	%ecx, -34(%edx)
+L(fwd_write_30bytes):
+	movl	-30(%eax), %ecx
+	movl	%ecx, -30(%edx)
+L(fwd_write_26bytes):
+	movl	-26(%eax), %ecx
+	movl	%ecx, -26(%edx)
+L(fwd_write_22bytes):
+	movl	-22(%eax), %ecx
+	movl	%ecx, -22(%edx)
+L(fwd_write_18bytes):
+	movl	-18(%eax), %ecx
+	movl	%ecx, -18(%edx)
+L(fwd_write_14bytes):
+	movl	-14(%eax), %ecx
+	movl	%ecx, -14(%edx)
+L(fwd_write_10bytes):
+	movl	-10(%eax), %ecx
+	movl	%ecx, -10(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes):
+	movl	-47(%eax), %ecx
+	movl	%ecx, -47(%edx)
+L(fwd_write_43bytes):
+	movl	-43(%eax), %ecx
+	movl	%ecx, -43(%edx)
+L(fwd_write_39bytes):
+	movl	-39(%eax), %ecx
+	movl	%ecx, -39(%edx)
+L(fwd_write_35bytes):
+	movl	-35(%eax), %ecx
+	movl	%ecx, -35(%edx)
+L(fwd_write_31bytes):
+	movl	-31(%eax), %ecx
+	movl	%ecx, -31(%edx)
+L(fwd_write_27bytes):
+	movl	-27(%eax), %ecx
+	movl	%ecx, -27(%edx)
+L(fwd_write_23bytes):
+	movl	-23(%eax), %ecx
+	movl	%ecx, -23(%edx)
+L(fwd_write_19bytes):
+	movl	-19(%eax), %ecx
+	movl	%ecx, -19(%edx)
+L(fwd_write_15bytes):
+	movl	-15(%eax), %ecx
+	movl	%ecx, -15(%edx)
+L(fwd_write_11bytes):
+	movl	-11(%eax), %ecx
+	movl	%ecx, -11(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN_END
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(large_page):
+	movdqu	(%eax), %xmm1
+	movdqu	%xmm0, (%esi)
+	movntdq	%xmm1, (%edx)
+	add	$0x10, %eax
+	add	$0x10, %edx
+	sub	$0x10, %ecx
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+L(large_page_loop_init):
+	POP (%esi)
+	sub	$0x80, %ecx
+	POP (%edi)
+L(large_page_loop):
+	prefetchnta	0x1c0(%eax)
+	prefetchnta	0x280(%eax)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	lfence
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(copy_page_by_rep):	/* Copy the remaining ECX bytes from EAX to EDX
+   using REP MOVSL for the whole dwords and explicit moves for up to
+   three trailing bytes, then return.  Both jumps to this label in this
+   file are taken when the low bytes of EAX and EDX are equal, and both
+   happen while the caller's ESI and EDI are still pushed; they are
+   popped again at L(copy_page_by_rep_exit).  */
+	mov	%eax, %esi		/* ESI = source for MOVS.  */
+	mov	%edx, %edi		/* EDI = destination for MOVS.  */
+	mov	%ecx, %edx
+	shr	$2, %ecx		/* ECX = number of whole dwords.  */
+	and	$3, %edx		/* EDX = 0..3 trailing bytes; sets ZF.  */
+	rep	movsl			/* Leaves the flags untouched.  */
+	jz	L(copy_page_by_rep_exit)	/* ZF from the AND: no tail.  */
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)	/* Exactly one byte left.  */
+	movzwl	(%esi), %eax		/* Two or three left: copy a word.  */
+	movw	%ax, (%edi)
+	add	$2, %esi
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)	/* It was exactly two.  */
+L(copy_page_by_rep_left_1):
+	movzbl	(%esi), %eax		/* Final single byte.  */
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%esi)
+	POP (%edi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* Return value: the original dest ...  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax		/* ... plus the length for mempcpy.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_44bytes):
+	movl	40(%eax), %ecx
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_45bytes):
+	movl	41(%eax), %ecx
+	movl	%ecx, 41(%edx)
+L(bk_write_41bytes):
+	movl	37(%eax), %ecx
+	movl	%ecx, 37(%edx)
+L(bk_write_37bytes):
+	movl	33(%eax), %ecx
+	movl	%ecx, 33(%edx)
+L(bk_write_33bytes):
+	movl	29(%eax), %ecx
+	movl	%ecx, 29(%edx)
+L(bk_write_29bytes):
+	movl	25(%eax), %ecx
+	movl	%ecx, 25(%edx)
+L(bk_write_25bytes):
+	movl	21(%eax), %ecx
+	movl	%ecx, 21(%edx)
+L(bk_write_21bytes):
+	movl	17(%eax), %ecx
+	movl	%ecx, 17(%edx)
+L(bk_write_17bytes):
+	movl	13(%eax), %ecx
+	movl	%ecx, 13(%edx)
+L(bk_write_13bytes):
+	movl	9(%eax), %ecx
+	movl	%ecx, 9(%edx)
+L(bk_write_9bytes):
+	movl	5(%eax), %ecx
+	movl	%ecx, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_46bytes):	/* Ladder for tails of 4k+2 bytes: one dword per rung from EAX+off to EDX+off, highest offset first, falling through.  */
+	movl	42(%eax), %ecx
+	movl	%ecx, 42(%edx)
+L(bk_write_42bytes):
+	movl	38(%eax), %ecx
+	movl	%ecx, 38(%edx)
+L(bk_write_38bytes):
+	movl	34(%eax), %ecx
+	movl	%ecx, 34(%edx)
+L(bk_write_34bytes):
+	movl	30(%eax), %ecx
+	movl	%ecx, 30(%edx)
+L(bk_write_30bytes):
+	movl	26(%eax), %ecx
+	movl	%ecx, 26(%edx)
+L(bk_write_26bytes):
+	movl	22(%eax), %ecx
+	movl	%ecx, 22(%edx)
+L(bk_write_22bytes):
+	movl	18(%eax), %ecx
+	movl	%ecx, 18(%edx)
+L(bk_write_18bytes):
+	movl	14(%eax), %ecx
+	movl	%ecx, 14(%edx)
+L(bk_write_14bytes):
+	movl	10(%eax), %ecx
+	movl	%ecx, 10(%edx)
+L(bk_write_10bytes):
+	movl	6(%eax), %ecx
+	movl	%ecx, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+L(bk_write_2bytes):	/* Bottom rung: the 16-bit word at offset 0.  */
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* Return value is the DEST argument; skipped entirely for bcopy.  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant returns DEST + LEN.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_47bytes):	/* Ladder for tails of 4k+3 bytes: one dword per rung from EAX+off to EDX+off, highest offset first, falling through.  */
+	movl	43(%eax), %ecx
+	movl	%ecx, 43(%edx)
+L(bk_write_43bytes):
+	movl	39(%eax), %ecx
+	movl	%ecx, 39(%edx)
+L(bk_write_39bytes):
+	movl	35(%eax), %ecx
+	movl	%ecx, 35(%edx)
+L(bk_write_35bytes):
+	movl	31(%eax), %ecx
+	movl	%ecx, 31(%edx)
+L(bk_write_31bytes):
+	movl	27(%eax), %ecx
+	movl	%ecx, 27(%edx)
+L(bk_write_27bytes):
+	movl	23(%eax), %ecx
+	movl	%ecx, 23(%edx)
+L(bk_write_23bytes):
+	movl	19(%eax), %ecx
+	movl	%ecx, 19(%edx)
+L(bk_write_19bytes):
+	movl	15(%eax), %ecx
+	movl	%ecx, 15(%edx)
+L(bk_write_15bytes):
+	movl	11(%eax), %ecx
+	movl	%ecx, 11(%edx)
+L(bk_write_11bytes):
+	movl	7(%eax), %ecx
+	movl	%ecx, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+L(bk_write_3bytes):	/* Bottom rung: bytes 1-2 as a word, then byte 0.  */
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax	/* EAX doubles as scratch: it is not used as a pointer again below.  */
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* Return value is the DEST argument; skipped entirely for bcopy.  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant returns DEST + LEN.  */
+# endif
+#endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):	/* 48 entries; entry i targets L(fwd_write_<i>bytes), i = 0..47.  */
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):	/* 16 entries; entry i targets L(shl_<i>), i = 0..15.  */
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):	/* 48 entries; entry i targets L(bk_write_<i>bytes), i = 0..47.  */
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):	/* Backward copy: on entry EAX is read as the source start, EDX as the destination start, ECX as the byte count.  */
+	PUSH (%esi)
+	movl	%eax, %esi
+	add	%ecx, %edx	/* EDX = one past the last destination byte.  */
+	add	%ecx, %esi	/* ESI = one past the last source byte.  */
+	testl	$0x3, %edx
+	jnz	L(bk_align)	/* Destination end is not 4-byte aligned.  */
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy one 32-byte chunk, highest dword first.  */
+	sub	$32, %ecx
+	movl	-4(%esi), %eax
+	movl	%eax, -4(%edx)
+	movl	-8(%esi), %eax
+	movl	%eax, -8(%edx)
+	movl	-12(%esi), %eax
+	movl	%eax, -12(%edx)
+	movl	-16(%esi), %eax
+	movl	%eax, -16(%edx)
+	movl	-20(%esi), %eax
+	movl	%eax, -20(%edx)
+	movl	-24(%esi), %eax
+	movl	%eax, -24(%edx)
+	movl	-28(%esi), %eax
+	movl	%eax, -28(%edx)
+	movl	-32(%esi), %eax
+	movl	%eax, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %esi
+
+L(bk_write_less32bytes):	/* Rewind both pointers to the start of the remaining ECX bytes, then dispatch on ECX.  */
+	movl	%esi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%esi)
+L(bk_write_less48bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	CFI_PUSH (%esi)	/* NOTE(review): presumably re-declares ESI as pushed for the code below, which runs before the POP above - confirm against the CFI_PUSH definition.  */
+	ALIGN (4)
+L(bk_align):
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)	/* 8 bytes or fewer: skip alignment and go straight to the table.  */
+	testl	$1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %esi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%esi), %eax	/* Copy one byte so that EDX becomes even.  */
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %esi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%esi), %eax	/* Copy one 16-bit word; EDX is 4-byte aligned afterwards.  */
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	ALIGN (4)
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is 4-byte aligned but not 16-byte aligned.  */
+L(bk_ssse3_align):	/* Copy up to three dwords until EDX reaches a 16-byte boundary.  */
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)	/* At most 12 of the >= 64 bytes were consumed above, so ECX >= 52 and the 32-byte chunk is safe.  */
+
+L(bk_ssse3_cpy):	/* 64 bytes per iteration: unaligned loads from ESI, aligned stores to EDX.  */
+	sub	$64, %esi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%esi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%esi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%esi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%esi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000000..53e8a6ca1d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3162 @@
+/* memcpy with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY		__memcpy_ssse3	/* Default entry-point name when the includer has not pre-defined MEMCPY.  */
+#  define MEMCPY_CHK	__memcpy_chk_ssse3	/* Only defaulted together with MEMCPY; used below unless USE_AS_BCOPY.  */
+# endif
+
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS	/* bcopy layout: SRC at PARMS, DEST 4 bytes above it, LEN 4 above that.  */
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS	/* Otherwise: DEST at PARMS, then SRC, then LEN.  */
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+
+# define CFI_PUSH(REG)	/* Unwind bookkeeping for a 4-byte push of REG: CFA offset +4, REG at offset 0.  */	\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	/* Inverse bookkeeping: CFA offset -4, REG restored.  */	\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push plus its CFI annotation.  */
+# define POP(REG)	popl REG; CFI_POP (REG)	/* Pop plus its CFI annotation.  */
+
+# ifdef SHARED
+#  define PARMS		8		/* Return address plus the EBX saved by ENTRANCE.  */
+#  define ENTRANCE	PUSH (%ebx);
+#  define RETURN_END	POP (%ebx); ret
+#  define RETURN		RETURN_END; CFI_PUSH (%ebx)	/* Mid-function return: re-declare EBX as pushed after the ret.  */
+#  define JMPTBL(I, B)	I - B	/* Table entry = target minus table base.  */
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register containing the
+	index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */		\
+	SETUP_PIC_REG(bx);		\
+    /* Get the address of the jump table.  */		\
+	addl	$(TABLE - .), %ebx;		\
+    /* Get the entry and convert the relative offset to the		\
+	absolute address.  */		\
+	addl	(%ebx, INDEX, SCALE), %ebx;		\
+    /* EBX now holds the absolute target.  Go.  */		\
+	jmp	*%ebx
+# else
+
+#  define PARMS		4	/* Only the return address precedes the arguments.  */
+#  define ENTRANCE
+#  define RETURN_END	ret
+#  define RETURN		RETURN_END
+#  define JMPTBL(I, B)	I	/* Table entry = the target address itself.  */
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute addresses.  INDEX is a register containing the index into
+	the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(, INDEX, SCALE)
+# endif
+
+	.section .text.ssse3,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)	/* Checked entry: compares two stack arguments, then continues into MEMCPY.  */
+	movl	12(%esp), %eax	/* Third stack argument (nothing pushed yet): the copy length.  */
+	cmpl	%eax, 16(%esp)	/* Fourth stack argument - presumably the destination object size; confirm against the caller's prototype.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Unsigned: fourth argument < length.  */
+END (MEMCPY_CHK)	/* No ret above: execution falls through to ENTRY (MEMCPY) that follows.  */
+# endif
+ENTRY (MEMCPY)	/* Entry: load the arguments, pick a direction (memmove only) and dispatch copies shorter than 48 bytes through a jump table.  */
+	ENTRANCE
+	movl	LEN(%esp), %ecx	/* ECX = byte count.  */
+	movl	SRC(%esp), %eax	/* EAX = source pointer.  */
+	movl	DEST(%esp), %edx	/* EDX = destination pointer.  */
+
+# ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)	/* DEST below SRC (unsigned): copy forward.  */
+	je	L(fwd_write_0bytes)	/* DEST == SRC: nothing needs to move.  */
+	cmp	$32, %ecx
+	jae	L(memmove_bwd)
+	jmp	L(bk_write_less32bytes_2)	/* DEST above SRC and fewer than 32 bytes.  */
+
+	.p2align 4
+L(memmove_bwd):
+	add	%ecx, %eax	/* EAX = SRC + LEN, used only for the comparison.  */
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax	/* Reload SRC; mov leaves the cmp flags intact.  */
+	jb	L(copy_backward)	/* DEST < SRC + LEN: the regions overlap, copy backward.  */
+
+L(copy_forward):
+# endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+# ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al	/* NOTE(review): compares only the low address bytes of SRC and DEST to choose a table; the rationale is not stated here.  */
+	jb	L(bk_write)
+# endif
+	add	%ecx, %edx	/* Advance both pointers past the end before the forward-table dispatch.  */
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+# ifndef USE_AS_MEMMOVE
+	.p2align 4
+L(bk_write):	/* Backward-table dispatch with both pointers still at the start.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+# endif
+
+	.p2align 4
+L(48bytesormore):	/* >= 48 bytes: handle the first 16 bytes, align EDX to 16, then choose a loop by size and by source misalignment.  */
+# ifndef USE_AS_MEMMOVE
+	movlpd	(%eax), %xmm0	/* Copy the first 16 bytes now, as two 8-byte halves.  */
+	movlpd	8(%eax), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+# else
+	movdqu	(%eax), %xmm0	/* memmove: only load the first 16 bytes; the L(shl_N) paths store XMM0 to DEST.  */
+# endif
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	add	$16, %edx	/* EDX = next 16-byte boundary strictly above DEST (1..16 bytes ahead).  */
+	sub	%edx, %edi	/* EDI = DEST - EDX, i.e. minus the bytes skipped.  */
+	add	%edi, %ecx	/* Shrink the count by the skipped bytes.  */
+	sub	%edi, %eax	/* Advance the source by the same amount.  */
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+
+	mov	%eax, %edi	/* mov keeps the flags of the cache-size compare for the jae.  */
+	jae	L(large_page)
+	and	$0xf, %edi	/* Source misalignment relative to 16 bytes: index into L(shl_table).  */
+	jz	L(shl_0)
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	.p2align 4
+L(shl_0):	/* Source and destination are both 16-byte aligned: plain aligned copies.  */
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi	/* +4 accounts for the EDI pushed in L(48bytesormore).  */
+	movdqu	%xmm0, (%edi)	/* Store the 16 head bytes loaded in L(48bytesormore).  */
+# endif
+	xor	%edi, %edi	/* EDI = running offset into both buffers.  */
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so each step below is a single sub/jb.  */
+
+	.p2align 4
+L(shl_0_loop):	/* Unrolled 32-byte steps; movdqa and lea leave the flags of each sub intact for the jb.  */
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+
+L(shl_0_end):
+	lea	32(%ecx), %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* Move both pointers past the end of the data for the table handlers.  */
+	add	%edi, %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(shl_0_gobble):	/* Aligned copy of more than 127 bytes: 128 bytes per iteration, loop chosen by the data-cache-size compare.  */
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	POP	(%edi)	/* EDI is not used by the gobble loops; pop and lea keep the cmp flags for the jae.  */
+	lea	-128(%ecx), %ecx	/* Bias the count by -128.  */
+	jae	L(shl_0_gobble_mem_loop)	/* Count >= threshold: use the prefetching loop.  */
+
+	.p2align 4
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)	/* Flags still come from the sub above.  */
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx	/* Undo the bias; lea keeps the cmp flags.  */
+	jl	L(shl_0_cache_less_64bytes)	/* Signed: biased count < -64, i.e. fewer than 64 bytes remain.  */
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_cache_less_16bytes):	/* Fewer than 16 bytes left: point past the end and dispatch on the count.  */
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
+L(shl_0_gobble_mem_loop):	/* Same 128-byte aligned loop as the cache variant, with prefetches ahead of both pointers.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x280(%eax)
+	prefetcht0 0x1c0(%edx)
+
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)	/* Flags still come from the sub above.  */
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx	/* Undo the -128 bias; lea keeps the cmp flags.  */
+	jl	L(shl_0_mem_less_64bytes)	/* Signed: fewer than 64 bytes remain.  */
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_mem_less_16bytes):	/* Fewer than 16 bytes left: point past the end and dispatch on the count.  */
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+	.p2align 4
+L(shl_1):	/* Source is 1 byte past a 16-byte boundary (L(shl_table) index 1); destination is 16-byte aligned.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-1(%eax), %xmm1	/* Aligned load of the block containing the first source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* +4 accounts for the EDI pushed in L(48bytesormore).  */
+	movaps	-1(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* Store the 16 head bytes loaded in L(48bytesormore).  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_1_no_prefetch)
+
+	lea	-64(%ecx), %ecx	/* Bias the count by -64 for the sub/ja loop test.  */
+
+	.p2align 4
+L(Shl1LoopStart):	/* 64 bytes per iteration: four aligned loads, each palignr merges two neighbouring blocks into one aligned store.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	movaps	47(%eax), %xmm4
+	movaps	63(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the unshifted last block...  */
+	palignr	$1, %xmm4, %xmm5
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* ...as the carry-in block of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count + 32 <= 0: hand off to L(shl_end_0).  */
+
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the ECX tail bytes.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(sh_1_no_prefetch):
+	lea	-32(%ecx), %ecx	/* Bias the count by -32.  */
+	lea	-1(%eax), %eax	/* Round the source down to its 16-byte boundary.  */
+	xor	%edi, %edi	/* EDI = running offset.  */
+
+	.p2align 4
+L(sh_1_no_prefetch_loop):	/* Two 32-byte steps per pass; XMM1 and XMM4 alternate as the carried previous block.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_1_end_no_prefetch_loop)	/* Flags still come from the sub above.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_1_no_prefetch_loop)
+
+L(sh_1_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax	/* Re-apply the 1-byte misalignment and advance past the end.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(shl_2):	/* Source is 2 bytes past a 16-byte boundary (L(shl_table) index 2); destination is 16-byte aligned.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-2(%eax), %xmm1	/* Aligned load of the block containing the first source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* +4 accounts for the EDI pushed in L(48bytesormore).  */
+	movaps	-2(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* Store the 16 head bytes loaded in L(48bytesormore).  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_2_no_prefetch)
+
+	lea	-64(%ecx), %ecx	/* Bias the count by -64 for the sub/ja loop test.  */
+
+	.p2align 4
+L(Shl2LoopStart):	/* 64 bytes per iteration: four aligned loads, each palignr merges two neighbouring blocks into one aligned store.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	movaps	46(%eax), %xmm4
+	movaps	62(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the unshifted last block...  */
+	palignr	$2, %xmm4, %xmm5
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* ...as the carry-in block of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count + 32 <= 0: hand off to L(shl_end_0).  */
+
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the ECX tail bytes.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(sh_2_no_prefetch):
+	lea	-32(%ecx), %ecx	/* Bias the count by -32.  */
+	lea	-2(%eax), %eax	/* Round the source down to its 16-byte boundary.  */
+	xor	%edi, %edi	/* EDI = running offset.  */
+
+	.p2align 4
+L(sh_2_no_prefetch_loop):	/* Two 32-byte steps per pass; XMM1 and XMM4 alternate as the carried previous block.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_2_end_no_prefetch_loop)	/* Flags still come from the sub above.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_2_no_prefetch_loop)
+
+L(sh_2_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax	/* Re-apply the 2-byte misalignment and advance past the end.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(shl_3):	/* Source is 3 bytes past a 16-byte boundary (L(shl_table) index 3); destination is 16-byte aligned.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-3(%eax), %xmm1	/* Aligned load of the block containing the first source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* +4 accounts for the EDI pushed in L(48bytesormore).  */
+	movaps	-3(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* Store the 16 head bytes loaded in L(48bytesormore).  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_3_no_prefetch)
+
+	lea	-64(%ecx), %ecx	/* Bias the count by -64 for the sub/ja loop test.  */
+
+	.p2align 4
+L(Shl3LoopStart):	/* 64 bytes per iteration: four aligned loads, each palignr merges two neighbouring blocks into one aligned store.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	movaps	45(%eax), %xmm4
+	movaps	61(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the unshifted last block...  */
+	palignr	$3, %xmm4, %xmm5
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* ...as the carry-in block of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count + 32 <= 0: hand off to L(shl_end_0).  */
+
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the ECX tail bytes.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(sh_3_no_prefetch):
+	lea	-32(%ecx), %ecx	/* Bias the count by -32.  */
+	lea	-3(%eax), %eax	/* Round the source down to its 16-byte boundary.  */
+	xor	%edi, %edi	/* EDI = running offset.  */
+
+	.p2align 4
+L(sh_3_no_prefetch_loop):	/* Two 32-byte steps per pass; XMM1 and XMM4 alternate as the carried previous block.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_3_end_no_prefetch_loop)	/* Flags still come from the sub above.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_3_no_prefetch_loop)
+
+L(sh_3_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax	/* Re-apply the 3-byte misalignment and advance past the end.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(shl_4):	/* Source is 4 bytes past a 16-byte boundary (L(shl_table) index 4); destination is 16-byte aligned.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-4(%eax), %xmm1	/* Aligned load of the block containing the first source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* +4 accounts for the EDI pushed in L(48bytesormore).  */
+	movaps	-4(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* Store the 16 head bytes loaded in L(48bytesormore).  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_4_no_prefetch)
+
+	lea	-64(%ecx), %ecx	/* Bias the count by -64 for the sub/ja loop test.  */
+
+	.p2align 4
+L(Shl4LoopStart):	/* 64 bytes per iteration: four aligned loads, each palignr merges two neighbouring blocks into one aligned store.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	movaps	44(%eax), %xmm4
+	movaps	60(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the unshifted last block...  */
+	palignr	$4, %xmm4, %xmm5
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* ...as the carry-in block of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count + 32 <= 0: hand off to L(shl_end_0).  */
+
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the ECX tail bytes.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(sh_4_no_prefetch):
+	lea	-32(%ecx), %ecx	/* Bias the count by -32.  */
+	lea	-4(%eax), %eax	/* Round the source down to its 16-byte boundary.  */
+	xor	%edi, %edi	/* EDI = running offset.  */
+
+	.p2align 4
+L(sh_4_no_prefetch_loop):	/* Two 32-byte steps per pass; XMM1 and XMM4 alternate as the carried previous block.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_4_end_no_prefetch_loop)	/* Flags still come from the sub above.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_4_no_prefetch_loop)
+
+L(sh_4_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax	/* Re-apply the 4-byte misalignment and advance past the end.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(shl_5):	/* Source is 5 bytes past a 16-byte boundary (L(shl_table) index 5); destination is 16-byte aligned.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-5(%eax), %xmm1	/* Aligned load of the block containing the first source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* +4 accounts for the EDI pushed in L(48bytesormore).  */
+	movaps	-5(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* Store the 16 head bytes loaded in L(48bytesormore).  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_5_no_prefetch)
+
+	lea	-64(%ecx), %ecx	/* Bias the count by -64 for the sub/ja loop test.  */
+
+	.p2align 4
+L(Shl5LoopStart):	/* 64 bytes per iteration: four aligned loads, each palignr merges two neighbouring blocks into one aligned store.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	movaps	43(%eax), %xmm4
+	movaps	59(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the unshifted last block...  */
+	palignr	$5, %xmm4, %xmm5
+	palignr	$5, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* ...as the carry-in block of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count + 32 <= 0: hand off to L(shl_end_0).  */
+
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the ECX tail bytes.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(sh_5_no_prefetch):
+	lea	-32(%ecx), %ecx	/* Bias the count by -32.  */
+	lea	-5(%eax), %eax	/* Round the source down to its 16-byte boundary.  */
+	xor	%edi, %edi	/* EDI = running offset.  */
+
+	.p2align 4
+L(sh_5_no_prefetch_loop):	/* Two 32-byte steps per pass; XMM1 and XMM4 alternate as the carried previous block.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_5_end_no_prefetch_loop)	/* Flags still come from the sub above.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_5_no_prefetch_loop)
+
+L(sh_5_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax	/* Re-apply the 5-byte misalignment and advance past the end.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(shl_6):	/* Source is 6 bytes past a 16-byte boundary (L(shl_table) index 6); destination is 16-byte aligned.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-6(%eax), %xmm1	/* Aligned load of the block containing the first source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* +4 accounts for the EDI pushed in L(48bytesormore).  */
+	movaps	-6(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* Store the 16 head bytes loaded in L(48bytesormore).  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_6_no_prefetch)
+
+	lea	-64(%ecx), %ecx	/* Bias the count by -64 for the sub/ja loop test.  */
+
+	.p2align 4
+L(Shl6LoopStart):	/* 64 bytes per iteration: four aligned loads, each palignr merges two neighbouring blocks into one aligned store.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	movaps	42(%eax), %xmm4
+	movaps	58(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the unshifted last block...  */
+	palignr	$6, %xmm4, %xmm5
+	palignr	$6, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* ...as the carry-in block of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count + 32 <= 0: hand off to L(shl_end_0).  */
+
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the ECX tail bytes.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(sh_6_no_prefetch):
+	lea	-32(%ecx), %ecx	/* Bias the count by -32.  */
+	lea	-6(%eax), %eax	/* Round the source down to its 16-byte boundary.  */
+	xor	%edi, %edi	/* EDI = running offset.  */
+
+	.p2align 4
+L(sh_6_no_prefetch_loop):	/* Two 32-byte steps per pass; XMM1 and XMM4 alternate as the carried previous block.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_6_end_no_prefetch_loop)	/* Flags still come from the sub above.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_6_no_prefetch_loop)
+
+L(sh_6_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax	/* Re-apply the 6-byte misalignment and advance past the end.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)	/* Code below is reached with EDI still pushed.  */
+
+	.p2align 4
+L(shl_7):	/* Forward copy with source misaligned by 7: aligned 16-byte loads are realigned with palignr $7 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-7(%eax), %xmm1	/* Carry-in chunk; movaps requires -7(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-7(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_7_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl7LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	movaps	41(%eax), %xmm4
+	movaps	57(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$7, %xmm4, %xmm5
+	palignr	$7, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl7LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(Shl7LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_7_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-7(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_7.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_7_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_7_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_7_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_7_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	7(%edi, %eax), %eax	/* Re-add the 7 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_8):	/* Forward copy with source misaligned by 8: aligned 16-byte loads are realigned with palignr $8 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-8(%eax), %xmm1	/* Carry-in chunk; movaps requires -8(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-8(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_8_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl8LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	movaps	40(%eax), %xmm4
+	movaps	56(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$8, %xmm4, %xmm5
+	palignr	$8, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl8LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(LoopLeave8):	/* NOTE(review): the sibling blocks name this label ShlNLoopLeave; the name differs here only.  */
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_8_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-8(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_8.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_8_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_8_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_8_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_8_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	8(%edi, %eax), %eax	/* Re-add the 8 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_9):	/* Forward copy with source misaligned by 9: aligned 16-byte loads are realigned with palignr $9 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-9(%eax), %xmm1	/* Carry-in chunk; movaps requires -9(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-9(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_9_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl9LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	movaps	39(%eax), %xmm4
+	movaps	55(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$9, %xmm4, %xmm5
+	palignr	$9, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl9LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(Shl9LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_9_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-9(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_9.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_9_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_9_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_9_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_9_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	9(%edi, %eax), %eax	/* Re-add the 9 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_10):	/* Forward copy with source misaligned by 10: aligned 16-byte loads are realigned with palignr $10 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-10(%eax), %xmm1	/* Carry-in chunk; movaps requires -10(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-10(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_10_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl10LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	movaps	38(%eax), %xmm4
+	movaps	54(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$10, %xmm4, %xmm5
+	palignr	$10, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl10LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(Shl10LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_10_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-10(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_10.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_10_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_10_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_10_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_10_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	10(%edi, %eax), %eax	/* Re-add the 10 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_11):	/* Forward copy with source misaligned by 11: aligned 16-byte loads are realigned with palignr $11 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-11(%eax), %xmm1	/* Carry-in chunk; movaps requires -11(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-11(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_11_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl11LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	movaps	37(%eax), %xmm4
+	movaps	53(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$11, %xmm4, %xmm5
+	palignr	$11, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl11LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(Shl11LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_11_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-11(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_11.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_11_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_11_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_11_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_11_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	11(%edi, %eax), %eax	/* Re-add the 11 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_12):	/* Forward copy with source misaligned by 12: aligned 16-byte loads are realigned with palignr $12 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-12(%eax), %xmm1	/* Carry-in chunk; movaps requires -12(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-12(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_12_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl12LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	movaps	36(%eax), %xmm4
+	movaps	52(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$12, %xmm4, %xmm5
+	palignr	$12, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl12LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(Shl12LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_12_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-12(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_12.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_12_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_12_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_12_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_12_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	12(%edi, %eax), %eax	/* Re-add the 12 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_13):	/* Forward copy with source misaligned by 13: aligned 16-byte loads are realigned with palignr $13 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-13(%eax), %xmm1	/* Carry-in chunk; movaps requires -13(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-13(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_13_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl13LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	movaps	35(%eax), %xmm4
+	movaps	51(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$13, %xmm4, %xmm5
+	palignr	$13, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl13LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(Shl13LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_13_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-13(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_13.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_13_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_13_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_13_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_13_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	13(%edi, %eax), %eax	/* Re-add the 13 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_14):	/* Forward copy with source misaligned by 14: aligned 16-byte loads are realigned with palignr $14 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-14(%eax), %xmm1	/* Carry-in chunk; movaps requires -14(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-14(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_14_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl14LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	movaps	34(%eax), %xmm4
+	movaps	50(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$14, %xmm4, %xmm5
+	palignr	$14, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl14LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(Shl14LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_14_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-14(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_14.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_14_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_14_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_14_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_14_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	14(%edi, %eax), %eax	/* Re-add the 14 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_15):	/* Forward copy with source misaligned by 15: aligned 16-byte loads are realigned with palignr $15 and stored with aligned stores to %edx.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-15(%eax), %xmm1	/* Carry-in chunk; movaps requires -15(%eax) to be 16-byte aligned.  */
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-15(%eax), %xmm1
+	movdqu	%xmm0, (%edi)	/* NOTE(review): %xmm0 is set before this chunk; presumably the first 16 source bytes -- confirm.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_15_no_prefetch)	/* Count below the half-data-cache threshold: use the loop without prefetch.  */
+
+	lea	-64(%ecx), %ecx	/* Bias the count by one 64-byte iteration.  */
+
+	.p2align 4
+L(Shl15LoopStart):	/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	movaps	33(%eax), %xmm4
+	movaps	49(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* Keep the last raw chunk; palignr below overwrites %xmm5.  */
+	palignr	$15, %xmm4, %xmm5
+	palignr	$15, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* Saved chunk becomes the carry-in of the next iteration.  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl15LoopStart)	/* Loop while the biased count stays above zero (unsigned).  */
+
+L(Shl15LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)	/* Biased count not positive: skip the extra 32-byte step.  */
+
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Skip the 32 bytes just stored plus the %ecx tail bytes; tail code addresses backwards from the end.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_15_no_prefetch):	/* Realigning copy without prefetch: 32 bytes per step, unrolled twice so the carry chunk alternates between %xmm1 and %xmm4.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by one 32-byte step.  */
+	lea	-15(%eax), %eax	/* %eax now addresses the chunk already loaded into %xmm1 by shl_15.  */
+	xor	%edi, %edi	/* %edi = running byte offset applied to both %eax and %edx.  */
+
+	.p2align 4
+L(sh_15_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the raw chunk as carry-in for the second half.  */
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_15_end_no_prefetch_loop)	/* Tests the borrow of the sub above; movdqa/palignr/lea leave flags unchanged.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next first half.  */
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_15_no_prefetch_loop)	/* No borrow from the second sub: keep looping.  */
+
+L(sh_15_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* Undo the -32 bias applied on entry.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* %edx = end of the destination data.  */
+	lea	15(%edi, %eax), %eax	/* Re-add the 15 bytes subtracted on entry.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_end_0):	/* Shared exit of the ShlNLoopLeave paths when no further 32-byte step is taken.  */
+	lea	32(%ecx), %ecx	/* Add 32 on top of the 32 already added at the LoopLeave label.  */
+	lea	(%edx, %ecx), %edx	/* Point %edx/%eax past the remaining %ecx bytes; the tail code uses negative offsets.  */
+	lea	(%eax, %ecx), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Tail dispatch indexed by %ecx (macro defined elsewhere).  */
+
+	.p2align 4
+L(fwd_write_44bytes):	/* Tail copy of 44/36/28/20/12/4 bytes ending at %eax (source) and %edx (dest): each label moves 8 bytes and falls through.  */
+	movq	-44(%eax), %xmm0
+	movq	%xmm0, -44(%edx)
+L(fwd_write_36bytes):
+	movq	-36(%eax), %xmm0
+	movq	%xmm0, -36(%edx)
+L(fwd_write_28bytes):
+	movq	-28(%eax), %xmm0
+	movq	%xmm0, -28(%edx)
+L(fwd_write_20bytes):
+	movq	-20(%eax), %xmm0
+	movq	%xmm0, -20(%edx)
+L(fwd_write_12bytes):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx	/* Last 4 bytes.  */
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_40bytes):	/* Tail copy of 40/32/24/16/8/0 bytes ending at %eax (source) and %edx (dest): each label moves 8 bytes and falls through.  */
+	movq	-40(%eax), %xmm0
+	movq	%xmm0, -40(%edx)
+L(fwd_write_32bytes):
+	movq	-32(%eax), %xmm0
+	movq	%xmm0, -32(%edx)
+L(fwd_write_24bytes):
+	movq	-24(%eax), %xmm0
+	movq	%xmm0, -24(%edx)
+L(fwd_write_16bytes):
+	movq	-16(%eax), %xmm0
+	movq	%xmm0, -16(%edx)
+L(fwd_write_8bytes):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes):	/* Nothing left to copy: only set the return value.  */
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_5bytes):	/* Tail copy of 5 bytes using two overlapping 4-byte moves (offsets -5 and -4).  */
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax	/* Overwrites the source pointer, so both loads precede the stores.  */
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_45bytes):	/* Tail copy of 45/37/29/21/13 bytes ending at %eax (source) and %edx (dest): 8 bytes per label, then a 4-byte and a 1-byte move.  */
+	movq	-45(%eax), %xmm0
+	movq	%xmm0, -45(%edx)
+L(fwd_write_37bytes):
+	movq	-37(%eax), %xmm0
+	movq	%xmm0, -37(%edx)
+L(fwd_write_29bytes):
+	movq	-29(%eax), %xmm0
+	movq	%xmm0, -29(%edx)
+L(fwd_write_21bytes):
+	movq	-21(%eax), %xmm0
+	movq	%xmm0, -21(%edx)
+L(fwd_write_13bytes):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx	/* Bytes -5..-2.  */
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx	/* Final byte.  */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_41bytes):	/* Tail copy of 41/33/25/17/9/1 bytes ending at %eax (source) and %edx (dest): 8 bytes per label, then the final byte.  */
+	movq	-41(%eax), %xmm0
+	movq	%xmm0, -41(%edx)
+L(fwd_write_33bytes):
+	movq	-33(%eax), %xmm0
+	movq	%xmm0, -33(%edx)
+L(fwd_write_25bytes):
+	movq	-25(%eax), %xmm0
+	movq	%xmm0, -25(%edx)
+L(fwd_write_17bytes):
+	movq	-17(%eax), %xmm0
+	movq	%xmm0, -17(%edx)
+L(fwd_write_9bytes):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx	/* Final byte.  */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_46bytes):	/* Tail copy of 46/38/30/22/14/6 bytes ending at %eax (source) and %edx (dest): 8 bytes per label, then a 4-byte and a 2-byte move.  */
+	movq	-46(%eax), %xmm0
+	movq	%xmm0, -46(%edx)
+L(fwd_write_38bytes):
+	movq	-38(%eax), %xmm0
+	movq	%xmm0, -38(%edx)
+L(fwd_write_30bytes):
+	movq	-30(%eax), %xmm0
+	movq	%xmm0, -30(%edx)
+L(fwd_write_22bytes):
+	movq	-22(%eax), %xmm0
+	movq	%xmm0, -22(%edx)
+L(fwd_write_14bytes):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx	/* Bytes -6..-3.  */
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx	/* Final 2 bytes.  */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_42bytes):	/* Tail copy of 42/34/26/18/10/2 bytes ending at %eax (source) and %edx (dest): 8 bytes per label, then the final 2 bytes.  */
+	movq	-42(%eax), %xmm0
+	movq	%xmm0, -42(%edx)
+L(fwd_write_34bytes):
+	movq	-34(%eax), %xmm0
+	movq	%xmm0, -34(%edx)
+L(fwd_write_26bytes):
+	movq	-26(%eax), %xmm0
+	movq	%xmm0, -26(%edx)
+L(fwd_write_18bytes):
+	movq	-18(%eax), %xmm0
+	movq	%xmm0, -18(%edx)
+L(fwd_write_10bytes):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx	/* Final 2 bytes.  */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_47bytes):	/* Tail copy of 47/39/31/23/15/7 bytes ending at %eax (source) and %edx (dest): 8 bytes per label, then 4 + 2 + 1 bytes.  */
+	movq	-47(%eax), %xmm0
+	movq	%xmm0, -47(%edx)
+L(fwd_write_39bytes):
+	movq	-39(%eax), %xmm0
+	movq	%xmm0, -39(%edx)
+L(fwd_write_31bytes):
+	movq	-31(%eax), %xmm0
+	movq	%xmm0, -31(%edx)
+L(fwd_write_23bytes):
+	movq	-23(%eax), %xmm0
+	movq	%xmm0, -23(%edx)
+L(fwd_write_15bytes):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax	/* Overwrites the source pointer; this is the last load from it.  */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_43bytes):	/* Tail copy of 43/35/27/19/11/3 bytes ending at %eax (source) and %edx (dest): 8 bytes per label, then 2 + 1 bytes.  */
+	movq	-43(%eax), %xmm0
+	movq	%xmm0, -43(%edx)
+L(fwd_write_35bytes):
+	movq	-35(%eax), %xmm0
+	movq	%xmm0, -35(%edx)
+L(fwd_write_27bytes):
+	movq	-27(%eax), %xmm0
+	movq	%xmm0, -27(%edx)
+L(fwd_write_19bytes):
+	movq	-19(%eax), %xmm0
+	movq	%xmm0, -19(%edx)
+L(fwd_write_11bytes):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax	/* Overwrites the source pointer; this is the last load from it.  */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_40bytes_align):	/* Aligned-tail copy of 40/24/8/0 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then one 8-byte move.  */
+	movdqa	-40(%eax), %xmm0
+	movdqa	%xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+	movdqa	-24(%eax), %xmm0
+	movdqa	%xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes_align):	/* Nothing left to copy: only set the return value.  */
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_32bytes_align):	/* Aligned-tail copy of 32/16 bytes ending at %eax/%edx using 16-byte movdqa steps (operands must be 16-byte aligned).  */
+	movdqa	-32(%eax), %xmm0
+	movdqa	%xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+	movdqa	-16(%eax), %xmm0
+	movdqa	%xmm0, -16(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_5bytes_align):	/* Tail copy of 5 bytes using two overlapping 4-byte moves (offsets -5 and -4).  */
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax	/* Overwrites the source pointer, so both loads precede the stores.  */
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_45bytes_align):	/* Aligned-tail copy of 45/29/13 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 8 + 4 + 1 bytes.  */
+	movdqa	-45(%eax), %xmm0
+	movdqa	%xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+	movdqa	-29(%eax), %xmm0
+	movdqa	%xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx	/* Final byte.  */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_37bytes_align):	/* Aligned-tail copy of 37/21 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 4 + 1 bytes.  */
+	movdqa	-37(%eax), %xmm0
+	movdqa	%xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+	movdqa	-21(%eax), %xmm0
+	movdqa	%xmm0, -21(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx	/* Final byte.  */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_41bytes_align):	/* Aligned-tail copy of 41/25/9/1 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 8 + 1 bytes.  */
+	movdqa	-41(%eax), %xmm0
+	movdqa	%xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+	movdqa	-25(%eax), %xmm0
+	movdqa	%xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+	movzbl	-1(%eax), %ecx	/* Final byte.  */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_33bytes_align):	/* Aligned-tail copy of 33/17 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then the final byte.  */
+	movdqa	-33(%eax), %xmm0
+	movdqa	%xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+	movdqa	-17(%eax), %xmm0
+	movdqa	%xmm0, -17(%edx)
+	movzbl	-1(%eax), %ecx	/* Final byte.  */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_46bytes_align):	/* Aligned-tail copy of 46/30/14/6 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 8 + 4 + 2 bytes.  */
+	movdqa	-46(%eax), %xmm0
+	movdqa	%xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+	movdqa	-30(%eax), %xmm0
+	movdqa	%xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx	/* Final 2 bytes.  */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_38bytes_align):	/* Aligned-tail copy of 38/22 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 4 + 2 bytes.  */
+	movdqa	-38(%eax), %xmm0
+	movdqa	%xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+	movdqa	-22(%eax), %xmm0
+	movdqa	%xmm0, -22(%edx)
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx	/* Final 2 bytes.  */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_42bytes_align):	/* Aligned-tail copy of 42/26/10/2 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 8 + 2 bytes.  */
+	movdqa	-42(%eax), %xmm0
+	movdqa	%xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+	movdqa	-26(%eax), %xmm0
+	movdqa	%xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+	movzwl	-2(%eax), %ecx	/* Final 2 bytes.  */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_34bytes_align):	/* Aligned-tail copy of 34/18 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then the final 2 bytes.  */
+	movdqa	-34(%eax), %xmm0
+	movdqa	%xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+	movdqa	-18(%eax), %xmm0
+	movdqa	%xmm0, -18(%edx)
+	movzwl	-2(%eax), %ecx	/* Final 2 bytes.  */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_47bytes_align):	/* Aligned-tail copy of 47/31/15/7 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 8 + 4 + 2 + 1 bytes.  */
+	movdqa	-47(%eax), %xmm0
+	movdqa	%xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+	movdqa	-31(%eax), %xmm0
+	movdqa	%xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax	/* Overwrites the source pointer; this is the last load from it.  */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_39bytes_align):	/* Aligned-tail copy of 39/23 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 4 + 2 + 1 bytes.  */
+	movdqa	-39(%eax), %xmm0
+	movdqa	%xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+	movdqa	-23(%eax), %xmm0
+	movdqa	%xmm0, -23(%edx)
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax	/* Overwrites the source pointer; this is the last load from it.  */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_43bytes_align):	/* Aligned-tail copy of 43/27/11/3 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 8 + 2 + 1 bytes.  */
+	movdqa	-43(%eax), %xmm0
+	movdqa	%xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+	movdqa	-27(%eax), %xmm0
+	movdqa	%xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax	/* Overwrites the source pointer; this is the last load from it.  */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_35bytes_align):	/* Aligned-tail copy of 35/19 bytes ending at %eax/%edx: 16-byte movdqa steps (operands must be 16-byte aligned), then 2 + 1 bytes.  */
+	movdqa	-35(%eax), %xmm0
+	movdqa	%xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+	movdqa	-19(%eax), %xmm0
+	movdqa	%xmm0, -19(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax	/* Overwrites the source pointer; this is the last load from it.  */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy variant: return the end of the destination.  */
+#  else
+	movl	DEST(%esp), %eax	/* Otherwise return the value stored at DEST(%esp).  */
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_44bytes_align):
+	movdqa	-44(%eax), %xmm0
+	movdqa	%xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+	movdqa	-28(%eax), %xmm0
+	movdqa	%xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_36bytes_align):
+	movdqa	-36(%eax), %xmm0
+	movdqa	%xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+	movdqa	-20(%eax), %xmm0
+	movdqa	%xmm0, -20(%edx)
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN_END
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(large_page):	/* Streaming path: loads from (%eax), non-temporal stores to (%edx), %ecx counts bytes.  */
+	movdqu	(%eax), %xmm1	/* First 16 source bytes.  */
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi	/* NOTE(review): the +4 presumably skips the pushed %edi -- confirm.  */
+	movdqu	%xmm0, (%edi)	/* %xmm0 is loaded before this label (not visible here).  */
+# endif
+	lea	16(%eax), %eax
+	movntdq	%xmm1, (%edx)	/* Store those 16 bytes with a non-temporal hint.  */
+	lea	16(%edx), %edx
+	lea	-0x90(%ecx), %ecx	/* 16 bytes done; keep %ecx biased by -0x80 for the loop below.  */
+	POP (%edi)
+
+	.p2align 4
+L(large_page_loop):	/* 0x80 bytes per iteration; invariant at loop top: %ecx == bytes left - 0x80.  */
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+
+	sub	$0x80, %ecx	/* Borrow (CF) means fewer than 0x80 bytes will remain after this iteration.  */
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)	/* Still tests the `sub' above: movntdq and lea leave the flags alone.  */
+	cmp	$-0x40, %ecx	/* %ecx is (bytes left - 0x80) here, i.e. negative.  */
+	lea	0x80(%ecx), %ecx	/* Remove the bias (%ecx = bytes left) without disturbing the flags.  */
+	jl	L(large_page_less_64bytes)	/* Fewer than 0x40 bytes left.  */
+
+	movdqu	(%eax), %xmm0	/* Copy one 64-byte block.  */
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0	/* Copy one 32-byte block.  */
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx	/* Point both registers just past the end of the data ...  */
+	add	%ecx, %eax	/* ... because the fwd_write_* tails address with negative offsets.  */
+	sfence	/* Fence after the non-temporal stores above.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)	/* Finish the last %ecx bytes via the table.  */
+
+	.p2align 4
+L(bk_write_44bytes):
+	movq	36(%eax), %xmm0
+	movq	%xmm0, 36(%edx)
+L(bk_write_36bytes):
+	movq	28(%eax), %xmm0
+	movq	%xmm0, 28(%edx)
+L(bk_write_28bytes):
+	movq	20(%eax), %xmm0
+	movq	%xmm0, 20(%edx)
+L(bk_write_20bytes):
+	movq	12(%eax), %xmm0
+	movq	%xmm0, 12(%edx)
+L(bk_write_12bytes):
+	movq	4(%eax), %xmm0
+	movq	%xmm0, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_40bytes):
+	movq	32(%eax), %xmm0
+	movq	%xmm0, 32(%edx)
+L(bk_write_32bytes):
+	movq	24(%eax), %xmm0
+	movq	%xmm0, 24(%edx)
+L(bk_write_24bytes):
+	movq	16(%eax), %xmm0
+	movq	%xmm0, 16(%edx)
+L(bk_write_16bytes):
+	movq	8(%eax), %xmm0
+	movq	%xmm0, 8(%edx)
+L(bk_write_8bytes):
+	movq	(%eax), %xmm0
+	movq	%xmm0, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_45bytes):
+	movq	37(%eax), %xmm0
+	movq	%xmm0, 37(%edx)
+L(bk_write_37bytes):
+	movq	29(%eax), %xmm0
+	movq	%xmm0, 29(%edx)
+L(bk_write_29bytes):
+	movq	21(%eax), %xmm0
+	movq	%xmm0, 21(%edx)
+L(bk_write_21bytes):
+	movq	13(%eax), %xmm0
+	movq	%xmm0, 13(%edx)
+L(bk_write_13bytes):
+	movq	5(%eax), %xmm0
+	movq	%xmm0, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_41bytes):
+	movq	33(%eax), %xmm0
+	movq	%xmm0, 33(%edx)
+L(bk_write_33bytes):
+	movq	25(%eax), %xmm0
+	movq	%xmm0, 25(%edx)
+L(bk_write_25bytes):
+	movq	17(%eax), %xmm0
+	movq	%xmm0, 17(%edx)
+L(bk_write_17bytes):
+	movq	9(%eax), %xmm0
+	movq	%xmm0, 9(%edx)
+L(bk_write_9bytes):
+	movq	1(%eax), %xmm0
+	movq	%xmm0, 1(%edx)
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_46bytes):
+	movq	38(%eax), %xmm0
+	movq	%xmm0, 38(%edx)
+L(bk_write_38bytes):
+	movq	30(%eax), %xmm0
+	movq	%xmm0, 30(%edx)
+L(bk_write_30bytes):
+	movq	22(%eax), %xmm0
+	movq	%xmm0, 22(%edx)
+L(bk_write_22bytes):
+	movq	14(%eax), %xmm0
+	movq	%xmm0, 14(%edx)
+L(bk_write_14bytes):
+	movq	6(%eax), %xmm0
+	movq	%xmm0, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_42bytes):
+	movq	34(%eax), %xmm0
+	movq	%xmm0, 34(%edx)
+L(bk_write_34bytes):
+	movq	26(%eax), %xmm0
+	movq	%xmm0, 26(%edx)
+L(bk_write_26bytes):
+	movq	18(%eax), %xmm0
+	movq	%xmm0, 18(%edx)
+L(bk_write_18bytes):
+	movq	10(%eax), %xmm0
+	movq	%xmm0, 10(%edx)
+L(bk_write_10bytes):
+	movq	2(%eax), %xmm0
+	movq	%xmm0, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_47bytes):
+	movq	39(%eax), %xmm0
+	movq	%xmm0, 39(%edx)
+L(bk_write_39bytes):
+	movq	31(%eax), %xmm0
+	movq	%xmm0, 31(%edx)
+L(bk_write_31bytes):
+	movq	23(%eax), %xmm0
+	movq	%xmm0, 23(%edx)
+L(bk_write_23bytes):
+	movq	15(%eax), %xmm0
+	movq	%xmm0, 15(%edx)
+L(bk_write_15bytes):
+	movq	7(%eax), %xmm0
+	movq	%xmm0, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_43bytes):
+	movq	35(%eax), %xmm0
+	movq	%xmm0, 35(%edx)
+L(bk_write_35bytes):
+	movq	27(%eax), %xmm0
+	movq	%xmm0, 27(%edx)
+L(bk_write_27bytes):
+	movq	19(%eax), %xmm0
+	movq	%xmm0, 19(%edx)
+L(bk_write_19bytes):
+	movq	11(%eax), %xmm0
+	movq	%xmm0, 11(%edx)
+L(bk_write_11bytes):
+	movq	3(%eax), %xmm0
+	movq	%xmm0, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	.p2align 2
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	.p2align 2
+L(table_48bytes_fwd_align):
+	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+	.p2align 2
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	.p2align 2
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+# ifdef USE_AS_MEMMOVE
+	.p2align 4
+L(copy_backward):	/* Copy from the highest address downwards; %ecx is the byte count.  */
+	PUSH (%edi)
+	movl	%eax, %edi	/* %edi takes over as the load pointer; %eax becomes scratch.  */
+	lea	(%ecx,%edx,1),%edx	/* %edx = store pointer + count (one past the end).  */
+	lea	(%ecx,%edi,1),%edi	/* %edi = load pointer + count (one past the end).  */
+	testl	$0x3, %edx
+	jnz	L(bk_align)	/* End of the destination is not 4-byte aligned.  */
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movq	-8(%edi), %xmm0	/* Four 8-byte moves, highest addresses first.  */
+	movq	%xmm0, -8(%edx)
+	movq	-16(%edi), %xmm0
+	movq	%xmm0, -16(%edx)
+	movq	-24(%edi), %xmm0
+	movq	%xmm0, -24(%edx)
+	movq	-32(%edi), %xmm0
+	movq	%xmm0, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %edi
+
+L(bk_write_less32bytes):
+	movl	%edi, %eax	/* Hand the load pointer back to %eax for the bk_write_* tails.  */
+	sub	%ecx, %edx	/* Rewind both pointers to the start of the remaining %ecx bytes ...  */
+	sub	%ecx, %eax	/* ... because the bk_write_* tails address with non-negative offsets.  */
+	POP (%edi)
+L(bk_write_less32bytes_2):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)	/* Finish the last %ecx bytes via the table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(bk_align):	/* Bring the end-of-destination pointer %edx down to a 4-byte boundary.  */
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)	/* 8 bytes or fewer: skip aligning, go straight to the tail.  */
+	testl	$1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %edi	/* Odd address: copy one byte.  */
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%edi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)	/* Already 4-byte aligned after the single byte.  */
+
+L(bk_got2):
+	sub	$2, %edi	/* Copy two bytes to reach the 4-byte boundary.  */
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%edi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	.p2align 4
+L(bk_write_more64bytes):
+	/* Check 16-byte alignment of the destination end pointer.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):	/* At most three 4-byte steps are needed, so the third one has no test.  */
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx	/* The alignment steps may have left fewer than 64 bytes.  */
+	jb	L(bk_write_more32bytes)
+
+	.p2align 4
+L(bk_ssse3_cpy):	/* 64 bytes per iteration: unaligned loads from %edi, aligned stores to %edx.  */
+	sub	$64, %edi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%edi), %xmm3	/* Highest 16-byte chunk first.  */
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%edi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%edi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%edi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)	/* Fewer than 64 bytes left: use the 32-byte step and the tail.  */
+
+# endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 0000000000..f725944620
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  In static binaries we need memcpy before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(memcpy)
+	.type	memcpy, @gnu_indirect_function	/* memcpy is an IFUNC symbol; this body is its resolver.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Defined elsewhere; presumably sets up access to the CPU feature data.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_ia32)	/* Default candidate (macro name suggests it lands in %eax).  */
+	HAS_CPU_FEATURE (SSE2)	/* Presumably leaves ZF set when the feature bit is clear -- confirm.  */
+	jz	2f	/* SSE2 test gave zero: keep __memcpy_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast_Unaligned_Load test gave non-zero: keep the SSE2 unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* SSSE3 test gave zero: keep the SSE2 unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Fast_Rep_String test gave zero: keep __memcpy_ssse3.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep)
+2:	ret	/* Return with the last candidate loaded above.  */
+END(memcpy)
+
+# undef ENTRY	/* From here on ENTRY ignores its argument and emits hidden __memcpy_ia32.  */
+# define ENTRY(name) \
+	.type __memcpy_ia32, @function; \
+	.p2align 4; \
+	.globl __memcpy_ia32; \
+	.hidden __memcpy_ia32; \
+	__memcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Matching END: closes the CFI region and sizes __memcpy_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
+
+# undef ENTRY_CHK	/* Same renaming for the _chk entry: always __memcpy_chk_ia32.  */
+# define ENTRY_CHK(name) \
+	.type __memcpy_chk_ia32, @function; \
+	.globl __memcpy_chk_ia32; \
+	.p2align 4; \
+	__memcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they are called without setting up EBX, which the PLT used by IFUNC
+   needs.  So make the internal alias __GI_memcpy the ia32 version.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
+#endif
+
+#include "../memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
new file mode 100644
index 0000000000..1b4fbe2e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __memcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch memcpy functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__memcpy_chk)
+	.type	__memcpy_chk, @gnu_indirect_function	/* IFUNC symbol; this body is its resolver.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Defined elsewhere; presumably sets up access to the CPU feature data.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)	/* Default candidate (macro name suggests it lands in %eax).  */
+	HAS_CPU_FEATURE (SSE2)	/* Presumably leaves ZF set when the feature bit is clear -- confirm.  */
+	jz	2f	/* SSE2 test gave zero: keep __memcpy_chk_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast_Unaligned_Load test gave non-zero: keep the SSE2 unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* SSSE3 test gave zero: keep the SSE2 unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Fast_Rep_String test gave zero: keep __memcpy_chk_ssse3.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep)
+2:	ret	/* Return with the last candidate loaded above.  */
+END(__memcpy_chk)
+# else
+#  include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
new file mode 100644
index 0000000000..3873594cb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Presumably tested by the included file to enable its memmove-only code.  */
+#define MEMCPY		__memmove_sse2_unaligned	/* Symbol name handed to the included implementation.  */
+#define MEMCPY_CHK	__memmove_chk_sse2_unaligned	/* Presumably the name of its _chk entry -- confirm.  */
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
new file mode 100644
index 0000000000..d202fc4a13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Presumably tested by the included file to enable its memmove-only code.  */
+#define MEMCPY		__memmove_ssse3_rep	/* Symbol name handed to the included implementation.  */
+#define MEMCPY_CHK	__memmove_chk_ssse3_rep	/* Presumably the name of its _chk entry -- confirm.  */
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000000..295430b1ef
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Enables the `# ifdef USE_AS_MEMMOVE' code (e.g. L(copy_backward)) in memcpy-ssse3.S.  */
+#define MEMCPY		__memmove_ssse3	/* memcpy-ssse3.S closes its function with END (MEMCPY).  */
+#define MEMCPY_CHK	__memmove_chk_ssse3	/* Presumably the name of its _chk entry -- confirm.  */
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
new file mode 100644
index 0000000000..6eb418ca7f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
@@ -0,0 +1,89 @@
+/* Multiple versions of memmove
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memmove)
+	.type	memmove, @gnu_indirect_function	/* memmove is an IFUNC symbol; this body is its resolver.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Defined elsewhere; presumably sets up access to the CPU feature data.  */
+	LOAD_FUNC_GOT_EAX (__memmove_ia32)	/* Default candidate (macro name suggests it lands in %eax).  */
+	HAS_CPU_FEATURE (SSE2)	/* Presumably leaves ZF set when the feature bit is clear -- confirm.  */
+	jz	2f	/* SSE2 test gave zero: keep __memmove_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast_Unaligned_Load test gave non-zero: keep the SSE2 unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* SSSE3 test gave zero: keep the SSE2 unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Fast_Rep_String test gave zero: keep __memmove_ssse3.  */
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep)
+2:	ret	/* Return with the last candidate loaded above.  */
+END(memmove)
+
+# ifdef SHARED	/* Shared build: __memmove_ia32 is emitted as a hidden symbol.  */
+#  undef ENTRY
+#  define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.p2align 4; \
+	.globl __memmove_ia32; \
+	.hidden __memmove_ia32; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# else	/* !SHARED: same entry sequence, but without the .hidden directive.  */
+#  undef ENTRY
+#  define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.globl __memmove_ia32; \
+	.p2align 4; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# endif
+
+# undef END	/* Matching END: closes the CFI region and sizes __memmove_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
+
+# undef ENTRY_CHK	/* Same renaming for the _chk entry: always __memmove_chk_ia32.  */
+# define ENTRY_CHK(name) \
+	.type __memmove_chk_ia32, @function; \
+	.globl __memmove_chk_ia32; \
+	.p2align 4; \
+	__memmove_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they are called without setting up EBX, which the PLT used by IFUNC
+   needs.  So make the internal alias __GI_memmove the ia32 version.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memmove; __GI_memmove = __memmove_ia32
+# endif
+#endif
+
+#include "../memmove.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
new file mode 100644
index 0000000000..314834c4c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -0,0 +1,94 @@
+/* Multiple versions of __memmove_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__memmove_chk)
+	.type	__memmove_chk, @gnu_indirect_function	/* IFUNC symbol; this body is its resolver.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Defined elsewhere; presumably sets up access to the CPU feature data.  */
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)	/* Default candidate (macro name suggests it lands in %eax).  */
+	HAS_CPU_FEATURE (SSE2)	/* Presumably leaves ZF set when the feature bit is clear -- confirm.  */
+	jz	2f	/* SSE2 test gave zero: keep __memmove_chk_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast_Unaligned_Load test gave non-zero: keep the SSE2 unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* SSSE3 test gave zero: keep the SSE2 unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Fast_Rep_String test gave zero: keep __memmove_chk_ssse3.  */
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep)
+2:	ret	/* Return with the last candidate loaded above.  */
+END(__memmove_chk)
+
+# ifndef SHARED	/* Static build: each _chk variant is a small bounds-check stub defined here.  */
+	.type __memmove_chk_sse2_unaligned, @function
+	.p2align 4;
+__memmove_chk_sse2_unaligned:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Third stack argument (presumably the copy length).  */
+	cmpl	%eax, 16(%esp)	/* Compare the fourth argument (presumably the destination size) with it.  */
+	jb	__chk_fail	/* Fourth argument is smaller (unsigned): fail.  */
+	jmp	__memmove_sse2_unaligned	/* Tail-jump; the stack arguments are left in place.  */
+	cfi_endproc
+	.size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned
+
+	.type __memmove_chk_ssse3, @function
+	.p2align 4;
+__memmove_chk_ssse3:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Same check as in the first stub.  */
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ssse3
+	cfi_endproc
+	.size __memmove_chk_ssse3, .-__memmove_chk_ssse3
+
+	.type __memmove_chk_ssse3_rep, @function
+	.p2align 4;
+__memmove_chk_ssse3_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Same check as in the first stub.  */
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ssse3_rep
+	cfi_endproc
+	.size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
+
+	.type __memmove_chk_ia32, @function
+	.p2align 4;
+__memmove_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Same check as in the first stub.  */
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ia32
+	cfi_endproc
+	.size __memmove_chk_ia32, .-__memmove_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..a1cea50771
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY	/* Presumably tested by the included file to select the mempcpy return value.  */
+#define MEMCPY		__mempcpy_sse2_unaligned	/* Symbol name handed to the included implementation.  */
+#define MEMCPY_CHK	__mempcpy_chk_sse2_unaligned	/* Presumably the name of its _chk entry -- confirm.  */
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
new file mode 100644
index 0000000000..5357b33e18
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY	/* Visible to the source included below; presumably switches it to mempcpy return semantics -- confirm there.  */
+#define MEMCPY		__mempcpy_ssse3_rep	/* Symbol name passed to the included source.  */
+#define MEMCPY_CHK	__mempcpy_chk_ssse3_rep	/* Symbol name for the checked variant.  */
+#include "memcpy-ssse3-rep.S"	/* Reuse the memcpy source with the macros above in effect.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000000..822d98e954
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY	/* Visible to the source included below; presumably switches it to mempcpy return semantics -- confirm there.  */
+#define MEMCPY		__mempcpy_ssse3	/* Symbol name passed to the included source.  */
+#define MEMCPY_CHK	__mempcpy_chk_ssse3	/* Symbol name for the checked variant.  */
+#include "memcpy-ssse3.S"	/* Reuse the memcpy source with the macros above in effect.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
new file mode 100644
index 0000000000..06e377fbc9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -0,0 +1,81 @@
+/* Multiple versions of mempcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need mempcpy before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(__mempcpy)	/* IFUNC resolver: leaves the address of the chosen implementation in EAX.  Comments below assume the HAS_*_FEATURE macros (from init-arch.h, not shown) set ZF when the bit is clear.  */
+	.type	__mempcpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_ia32)	/* Default candidate.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* No SSE2: keep the ia32 version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast_Unaligned_Load set: keep the SSE2 unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: keep the SSE2 unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Fast_Rep_String clear: keep the SSSE3 version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep)	/* Otherwise use the SSSE3 + REP string version.  */
+2:	ret
+END(__mempcpy)
+
+# undef ENTRY	/* The replacement ENTRY/END ignore their NAME argument and always emit the hidden global symbol __mempcpy_ia32.  */
+# define ENTRY(name) \
+	.type __mempcpy_ia32, @function; \
+	.p2align 4; \
+	.globl __mempcpy_ia32; \
+	.hidden __mempcpy_ia32; \
+	__mempcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Matching END: closes the CFI region and records the size of __mempcpy_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+
+# undef ENTRY_CHK	/* The replacement ENTRY_CHK/END_CHK ignore NAME and always emit __mempcpy_chk_ia32; unlike ENTRY above, no .hidden directive is emitted here.  */
+# define ENTRY_CHK(name) \
+	.type __mempcpy_chk_ia32, @function; \
+	.globl __mempcpy_chk_ia32; \
+	.p2align 4; \
+	__mempcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK	/* Matching END_CHK: closes the CFI region and records the size of __mempcpy_chk_ia32.  */
+# define END_CHK(name) \
+	cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they will be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_def(name) \
+	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32	/* NAME is ignored: __GI_mempcpy is bound straight to the ia32 version.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32	/* NAME is ignored: __GI___mempcpy is bound straight to the ia32 version.  */
+#endif
+
+#include "../mempcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000000..e13e5248a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __mempcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch mempcpy functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__mempcpy_chk)	/* IFUNC resolver: leaves the address of the chosen implementation in EAX.  Comments below assume the HAS_*_FEATURE macros (from init-arch.h, not shown) set ZF when the bit is clear.  */
+	.type	__mempcpy_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)	/* Default candidate.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* No SSE2: keep the ia32 version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast_Unaligned_Load set: keep the SSE2 unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: keep the SSE2 unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Fast_Rep_String clear: keep the SSSE3 version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep)	/* Otherwise use the SSSE3 + REP string version.  */
+2:	ret
+END(__mempcpy_chk)
+# else
+#  include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 0000000000..ef7bbbe792
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#if IS_IN (libc)
+# define MEMRCHR  __memrchr_ia32	/* Presumably the name string/memrchr.c gives its function -- confirm in that file.  */
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);	/* Prototype for the renamed generic implementation.  */
+#endif
+/* The generic C source is included unconditionally; only the renaming above depends on IS_IN (libc).  */
+#include "string/memrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 0000000000..dbbe94fd08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,417 @@
+/* Optimized memrchr with sse2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+/* CFI_PUSH above records a 4-byte push of REG; CFI_POP below undoes that record.  */
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+/* PUSH/POP pair the real instruction with the matching CFI annotation.  */
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+/* Stack offsets of the three arguments at function entry.  */
+# define PARMS  4	/* First argument sits at 4(%esp).  */
+# define STR1  PARMS	/* Buffer pointer.  */
+# define STR2  STR1+4	/* Value to search for.  */
+# define LEN   STR2+4	/* Buffer length.  */
+
+# define MEMCHR __memrchr_sse2_bsf	/* Name of the function defined in this file.  */
+
+	.text
+ENTRY (MEMCHR)	/* Backward byte search: arguments at STR1/STR2/LEN(%esp); result (address of the last match, or 0) in EAX.  */
+	mov	STR1(%esp), %ecx	/* ECX = start of buffer.  */
+	movd	STR2(%esp), %xmm1	/* Low dword of XMM1 = search value.  */
+	mov	LEN(%esp), %edx	/* EDX = length.  */
+/* Lengths of 16 or less take the short path; afterwards EDX = length - 16.  */
+	sub	$16, %edx
+	jbe	L(length_less16)
+/* Two punpcklbw plus pshufd $0 replicate the low byte of XMM1 into all 16 lanes.  */
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx	/* ECX = address of the last 16 bytes of the buffer.  */
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0	/* Unaligned load of those last 16 bytes.  */
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+/* No match in the tail: step ECX back 64 bytes and test its 16-byte alignment.  */
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+/* Round ECX up to the next 16-byte boundary and grow EDX by the same amount, so EDX stays the byte count from the buffer start to ECX+64.  */
+	add	$16, %ecx
+	add	$16, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx	/* 64 or fewer bytes left below ECX+64: finish in exit_loop.  */
+	jbe	L(exit_loop)
+/* Whole 64-byte window is inside the buffer: test its four 16-byte blocks from highest to lowest.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+/* Second 64-byte window, handled the same way.  */
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+/* Before the main loop, check whether ECX is 64-byte aligned.  */
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+/* Not aligned: round ECX up to a 64-byte boundary and add the same amount to EDX (part of the window just scanned is scanned again).  */
+	add	$64, %ecx
+	add	$64, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx	/* Step back one 64-byte window.  */
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* Window would reach or pass the buffer start: finish in exit_loop.  */
+/* Compare all four blocks, then OR the results together with pmaxub for a single any-match test.  */
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+/* A match exists somewhere in this window.  XMM4 and XMM3 still hold the per-block results for +48 and +32.  */
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+/* XMM2 and XMM0 were overwritten by pmaxub, so blocks +16 and +0 are compared again.  */
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1	/* Overwrites the broadcast pattern; no further comparison follows on this path.  */
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+/* Only block +0 remains: the highest set mask bit is the last matching byte.  */
+	pmovmskb %xmm1, %eax
+	bsr	%eax, %eax
+
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx	/* Undo the last subtraction: EDX = bytes of the buffer inside [ECX, ECX+64).  */
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+/* More than 32 bytes remain, so blocks +48 and +32 lie fully inside the buffer.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+/* Block +16 may start before the buffer, so its match goes through the bounds-checked exit.  */
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx	/* 48 or fewer bytes: block +0 is entirely before the buffer.  */
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):	/* At most 32 bytes remain: only blocks +48 and +32 can hold buffer bytes.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx	/* 16 or fewer bytes: block +32 is entirely before the buffer.  */
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches0):	/* EAX = match mask for the block at ECX; return ECX + index of the highest set bit.  */
+	bsr	%eax, %eax
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16):	/* Same, for the block at ECX+16.  */
+	bsr	%eax, %eax
+	lea	16(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches32):	/* Same, for the block at ECX+32.  */
+	bsr	%eax, %eax
+	lea	32(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches48):	/* Same, for the block at ECX+48.  */
+	bsr	%eax, %eax
+	lea	48(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):	/* Bounds-checked exits: EDX = buffer bytes in [ECX, ECX+64); a match below the buffer start yields NULL.  */
+	bsr	%eax, %eax
+	sub	$64, %edx
+	add	%eax, %edx	/* EDX = index + EDX - 64; negative means the byte precedes the buffer.  */
+	jl	L(return_null)
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):	/* Block at ECX+16.  */
+	bsr	%eax, %eax
+	sub	$48, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	16(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches32_1):	/* Block at ECX+32.  */
+	bsr	%eax, %eax
+	sub	$32, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	32(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches48_1):	/* Block at ECX+48.  */
+	bsr	%eax, %eax
+	sub	$16, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	48(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(return_null):	/* No match: return 0.  */
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):	/* Short buffer starting on a 16-byte boundary: EAX = start, DL = length (1..16).  */
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+/* ECX = (1 << length) - 1: one bit per byte inside the buffer.  */
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+	mov	%edx, %ecx
+
+	pmovmskb %xmm1, %edx
+
+	and	%ecx, %edx	/* Drop matches beyond the end of the buffer.  */
+	test	%edx, %edx
+	jz	L(return_null)
+
+	bsr	%edx, %ecx	/* Highest remaining bit = last matching byte.  */
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(length_less16):	/* Length <= 16.  On entry ECX = start and EDX = length - 16.  */
+	punpcklbw %xmm1, %xmm1
+	mov	%ecx, %eax
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx	/* EDX = length again; zero length returns NULL.  */
+	jz	L(return_null)
+
+	pshufd	$0, %xmm1, %xmm1	/* Finish broadcasting the search byte.  */
+	and	$15, %ecx	/* ECX = offset of the start within its 16-byte block.  */
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+	mov	%cl, %dh
+	add	%dl, %dh	/* DH = offset + length.  */
+	and	$-16, %eax	/* EAX = aligned block containing the start.  */
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)	/* offset + length > 16: the buffer spills into the next block.  */
+/* Buffer lies within one aligned block.  */
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi	/* Shift out bytes that precede the start.  */
+	add	%ecx, %eax	/* EAX = start of buffer.  */
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* EDX = (1 << length) - 1.  */
+
+	and	%edx, %edi	/* Drop matches beyond the end.  */
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Code after the ret above is reached with EDI still pushed; restore that CFI state.  */
+
+	.p2align 4
+L(length_less16_part2):	/* Buffer spans two aligned blocks: EAX = first block, CL = start offset, DH = bytes used in the second block.  */
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch	/* Save the start offset in CH.  */
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* Mask of the second-block bytes that belong to the buffer.  */
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+/* Nothing in the second block: look at the first block, ignoring bytes before the start.  */
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch	/* Clear CH so ECX is just the start offset.  */
+	add	%ecx, %eax	/* Result = block + offset + index.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Restore the "EDI pushed" CFI state after the preceding ret.  */
+
+	.p2align 4
+L(length_less16_part2_return):	/* Match in the second block: result = EAX + 16 + index of the highest set bit.  */
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Restore the "EDI pushed" CFI state after the preceding ret.  */
+
+	.p2align 4
+L(ret_null):	/* No match on the paths that saved EDI: return 0 after restoring it.  */
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000000..5f7853f683
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,724 @@
+/* Optimized memrchr with sse2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+/* CFI_PUSH above records a 4-byte push of REG; CFI_POP below undoes that record.  */
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+/* PUSH/POP pair the real instruction with the matching CFI annotation.  */
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+/* Stack offsets of the three arguments at function entry.  */
+# define PARMS  4	/* First argument sits at 4(%esp).  */
+# define STR1  PARMS	/* Buffer pointer.  */
+# define STR2  STR1+4	/* Value to search for.  */
+# define LEN   STR2+4	/* Buffer length.  */
+
+	atom_text_section
+ENTRY (__memrchr_sse2)	/* Backward byte search: arguments at STR1/STR2/LEN(%esp); result (address of the last match, or 0) in EAX.  */
+	mov	STR1(%esp), %ecx	/* ECX = start of buffer.  */
+	movd	STR2(%esp), %xmm1	/* Low dword of XMM1 = search value.  */
+	mov	LEN(%esp), %edx	/* EDX = length.  */
+/* Lengths of 16 or less take the short path; afterwards EDX = length - 16.  */
+	sub	$16, %edx
+	jbe	L(length_less16)
+/* Two punpcklbw plus pshufd $0 replicate the low byte of XMM1 into all 16 lanes.  */
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx	/* ECX = address of the last 16 bytes of the buffer.  */
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0	/* Unaligned load of those last 16 bytes.  */
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)	/* Match in the tail block: ECX is its base, EAX its mask.  */
+/* No match in the tail: step ECX back 64 bytes and test its 16-byte alignment.  */
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+/* Round ECX up to the next 16-byte boundary and grow EDX by the same amount (16 - EAX).  */
+	lea	16(%ecx), %ecx
+	lea	16(%edx), %edx
+	sub	%eax, %edx
+	and	$-16, %ecx
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx	/* 64 or fewer bytes left below ECX+64: finish in exit_loop.  */
+	jbe	L(exit_loop)
+/* Whole 64-byte window is inside the buffer: test its four 16-byte blocks from highest to lowest.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+/* Second 64-byte window, handled the same way.  */
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+/* Before the main loop, check whether ECX is 64-byte aligned.  */
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+/* Not aligned: round ECX up to a 64-byte boundary and add the same amount (64 - EAX) to EDX.  */
+	lea	64(%ecx), %ecx
+	lea	64(%edx), %edx
+	and	$-64, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx	/* Step back one 64-byte window.  */
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* Window would reach or pass the buffer start: finish in exit_loop.  */
+/* Compare all four blocks, then OR the results together with pmaxub for a single any-match test.  */
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+/* A match exists somewhere in this window.  XMM4 and XMM3 still hold the per-block results for +48 and +32.  */
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+/* XMM2 and XMM0 were overwritten by pmaxub, so blocks +16 and +0 are compared again.  */
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1	/* Overwrites the broadcast pattern; no further comparison follows on this path.  */
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+/* Only block +0 remains.  Find the highest set bit of its mask with explicit bit tests (same sequence as exit_dispatch).  */
+	pmovmskb %xmm1, %eax
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Only bit 0 can be set here: the match is the first byte of the block.  */
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx	/* Undo the last subtraction: EDX = bytes of the buffer inside [ECX, ECX+64).  */
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+/* More than 32 bytes remain, so blocks +48 and +32 lie fully inside the buffer.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+/* Block +16 may start before the buffer, so its match goes through the bounds-checked exit.  */
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx	/* 48 or fewer bytes: block +0 is entirely before the buffer.  */
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):	/* At most 32 bytes remain: only blocks +48 and +32 can hold buffer bytes.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx	/* 16 or fewer bytes: block +32 is entirely before the buffer.  */
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches16):	/* Match in the block at ECX+16: rebase ECX, then locate the highest set bit of AX by bit tests.  */
+	lea	16(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32):	/* Same, for the block at ECX+32.  */
+	lea	32(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48):	/* Block at ECX+48: rebase ECX and fall through into exit_dispatch.  */
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch):	/* ECX = block base, AX = non-zero match mask.  Returns ECX + index of the highest set bit.  */
+	test	%ah, %ah	/* Any match in bytes 8..15?  */
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl	/* Any match in bytes 4..7?  */
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Only bit 0 is left: byte 0.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_8):	/* Highest match is among bytes 4..7 (bits 4..7 of AL).  */
+	test	$0x80, %al
+	jnz	L(exit_8)
+	test	$0x40, %al
+	jnz	L(exit_7)
+	test	$0x20, %al
+	jnz	L(exit_6)
+	lea	4(%ecx), %eax	/* Only bit 4 is left: byte 4.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_high):	/* Highest match is among bytes 8..15 (AH non-zero).  */
+	mov	%ah, %dh
+	and	$15 << 4, %dh	/* Any match in bytes 12..15?  */
+	jnz	L(exit_dispatch_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_12)
+	test	$0x04, %ah
+	jnz	L(exit_11)
+	test	$0x02, %ah
+	jnz	L(exit_10)
+	lea	8(%ecx), %eax	/* Only bit 0 of AH is left: byte 8.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_high_8):	/* Highest match is among bytes 12..15 (bits 4..7 of AH).  */
+	test	$0x80, %ah
+	jnz	L(exit_16)
+	test	$0x40, %ah
+	jnz	L(exit_15)
+	test	$0x20, %ah
+	jnz	L(exit_14)
+	lea	12(%ecx), %eax	/* Only bit 4 of AH is left: byte 12.  */
+	ret
+
+	.p2align 4
+L(exit_2):	/* exit_N returns ECX + (N - 1); each label is the target for one mask bit.  */
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_4):
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_6):
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_7):
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_8):
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_10):
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_11):
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_12):
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_14):
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_15):
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_16):
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):	/* Bounds-checked exits.  EDX = buffer bytes in [ECX, ECX+64) on entry; it is biased so that (EDX + byte index) < 0 means the match precedes the buffer.  */
+	lea	-64(%edx), %edx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah	/* AH is zero here, so it is free as scratch.  */
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx	/* Byte 0: only sets the flags from EDX.  */
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):	/* Block at ECX+16: bias EDX and rebase ECX.  */
+	lea	-48(%edx), %edx
+	lea	16(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32_1):	/* Block at ECX+32.  */
+	lea	-32(%edx), %edx
+	lea	32(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48_1):	/* Block at ECX+48: falls through into exit_dispatch_1.  */
+	lea	-16(%edx), %edx
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch_1):	/* ECX = block base, AX = non-zero mask, EDX = biased bound.  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_8):	/* Highest match is among bytes 4..7 (bits 4..7 of AL); bounds-checked.  */
+	test	$0x80, %al
+	jnz	L(exit_1_8)
+	test	$0x40, %al
+	jnz	L(exit_1_7)
+	test	$0x20, %al
+	jnz	L(exit_1_6)
+	add	$4, %edx	/* Byte 4: negative result means it precedes the buffer.  */
+	jl	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high):	/* Highest match is among bytes 8..15 (AH non-zero); AL is reused as scratch.  */
+	mov	%ah, %al
+	and	$15 << 4, %al
+	jnz	L(exit_dispatch_1_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_1_12)
+	test	$0x04, %ah
+	jnz	L(exit_1_11)
+	test	$0x02, %ah
+	jnz	L(exit_1_10)
+	add	$8, %edx	/* Byte 8.  */
+	jl	L(return_null)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high_8):	/* Highest match is among bytes 12..15 (bits 4..7 of AH).  */
+	test	$0x80, %ah
+	jnz	L(exit_1_16)
+	test	$0x40, %ah
+	jnz	L(exit_1_15)
+	test	$0x20, %ah
+	jnz	L(exit_1_14)
+	add	$12, %edx	/* Byte 12.  */
+	jl	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_2):	/* exit_1_N: add the byte index (N - 1) to the biased bound in EDX; negative means the match precedes the buffer (return NULL), otherwise return ECX + (N - 1).  */
+	add	$1, %edx
+	jl	L(return_null)
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_3):
+	add	$2, %edx
+	jl	L(return_null)
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_4):
+	add	$3, %edx
+	jl	L(return_null)
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_6):
+	add	$5, %edx
+	jl	L(return_null)
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_7):
+	add	$6, %edx
+	jl	L(return_null)
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_8):
+	add	$7, %edx
+	jl	L(return_null)
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_10):
+	add	$9, %edx
+	jl	L(return_null)
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_11):
+	add	$10, %edx
+	jl	L(return_null)
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_12):
+	add	$11, %edx
+	jl	L(return_null)
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_14):
+	add	$13, %edx
+	jl	L(return_null)
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_15):
+	add	$14, %edx
+	jl	L(return_null)
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_16):
+	add	$15, %edx
+	jl	L(return_null)
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):	/* No match: return 0.  */
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):	/* Short buffer starting on a 16-byte boundary: EAX = start, DL = length (1..16).  */
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+/* EDX = (1 << length) - 1: one bit per byte inside the buffer.  */
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	mov	%eax, %ecx	/* exit_dispatch expects the block base in ECX.  */
+	pmovmskb %xmm1, %eax
+
+	and	%edx, %eax	/* Drop matches beyond the end of the buffer.  */
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16):	/* Length <= 16.  On entry ECX = start and EDX = length - 16.  */
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx	/* EDX = length again; zero length returns NULL.  */
+	je	L(return_null)
+	punpcklbw %xmm1, %xmm1
+
+	mov	%ecx, %eax
+	pshufd	$0, %xmm1, %xmm1	/* Finish broadcasting the search byte.  */
+
+	and	$15, %ecx	/* ECX = offset of the start within its 16-byte block.  */
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+
+	mov	%cl, %dh
+	add	%dl, %dh	/* DH = offset + length.  */
+	and	$-16, %eax	/* EAX = aligned block containing the start.  */
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)	/* offset + length > 16: the buffer spills into the next block.  */
+/* Buffer lies within one aligned block.  */
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi	/* Shift out bytes that precede the start.  */
+	add	%ecx, %eax	/* EAX = start of buffer.  */
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* EDX = (1 << length) - 1.  */
+
+	and	%edx, %edi	/* Drop matches beyond the end.  */
+	test	%edi, %edi
+	jz	L(ret_null)
+/* NOTE(review): this path uses bsr although the file header says "without bsf" -- confirm this is intended for Slow_BSF targets.  */
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Code after the ret above is reached with EDI still pushed; restore that CFI state.  */
+
+	.p2align 4
+L(length_less16_part2):	/* Buffer spans two aligned blocks: EAX = first block, CL = start offset, DH = bytes used in the second block.  */
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch	/* Save the start offset in CH.  */
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* Mask of the second-block bytes that belong to the buffer.  */
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+/* Nothing in the second block: look at the first block, ignoring bytes before the start.  */
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch	/* Clear CH so ECX is just the start offset.  */
+	add	%ecx, %eax	/* Result = block + offset + index.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Restore the "EDI pushed" CFI state after the preceding ret.  */
+
+	.p2align 4
+L(length_less16_part2_return):	/* Match in the second block: result = EAX + 16 + index of the highest set bit.  */
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Restore the "EDI pushed" CFI state after the preceding ret.  */
+
+	.p2align 4
+L(ret_null):	/* No match on the paths that saved EDI: return 0 after restoring it.  */
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (__memrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 0000000000..d4253a553b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,45 @@
+/* Multiple versions of memrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__memrchr)	/* IFUNC resolver: leaves the address of the chosen implementation in EAX.  Comments below assume the HAS_*_FEATURE macros (from init-arch.h, not shown) set ZF when the bit is clear.  */
+	.type	__memrchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* No SSE2: use the ia32 version.  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f	/* Slow_BSF clear: use the bsr-based SSE2 version.  */
+/* SSE2 present and Slow_BSF set.  */
+	LOAD_FUNC_GOT_EAX (__memrchr_sse2)
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memrchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
+	ret
+END(__memrchr)
+/* Export the resolver under the public name as a weak alias.  */
+weak_alias(__memrchr, memrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
new file mode 100644
index 0000000000..3221077e49
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -0,0 +1,811 @@
+/* memset with SSE2 and REP string.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.   */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2_rep)	/* Checked entry point; only built when SHARED and not USE_AS_BZERO.  */
+	movl	12(%esp), %eax		/* EAX = length argument; EBX has not been pushed yet here.  */
+	cmpl	%eax, 16(%esp)		/* Compare the following argument (presumably the destination size) with it.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Unsigned: that argument is below the length.  */
+END (__memset_chk_sse2_rep)	/* No ret: execution continues into __memset_sse2_rep below.  */
+#endif
+ENTRY (__memset_sse2_rep)
+	ENTRANCE		/* Pushes EBX when SHARED, otherwise empty.  */
+
+	movl	LEN(%esp), %ecx		/* ECX = byte count.  */
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax		/* bzero: the fill pattern is zero.  */
+#else
+	movzbl	CHR(%esp), %eax		/* EAX = fill byte, zero-extended.  */
+	movb	%al, %ah		/* Low 16 bits now hold the byte twice.  */
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
+	movl	DEST(%esp), %edx	/* EDX = destination pointer.  */
+	cmp	$32, %ecx
+	jae	L(32bytesormore)	/* 32 bytes or more use the SSE2 code.  */
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))	/* ECX is 0..31 and indexes the table.  */
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):	/* Lengths 0, 4, ..., 28: dword stores only.  */
+	movl	%eax, -28(%edx)		/* BRANCH_TO_JMPTBL_ENTRY added ECX to EDX, so offsets count back from the end.  */
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):	/* Lengths 1, 5, ..., 29: dword stores, then one byte.  */
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):	/* Lengths 2, 6, ..., 30: dword stores, then one word.  */
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):	/* Lengths 3, 7, ..., 31: dword stores, then a word and a byte.  */
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX >= 32; the alignment of EDX is tested below.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	pshufd	$0, %xmm0, %xmm0	/* Copy the low dword into all four lanes.  */
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX >= 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)		/* Unaligned store covers the first 16 bytes.  */
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx		/* EDX = next 16-byte boundary.  */
+	sub	%edx, %eax		/* EAX = -(bytes skipped to reach that boundary).  */
+	add	%eax, %ecx		/* ECX = bytes left counting from the boundary.  */
+	movd	%xmm0, %eax		/* Reload the dword pattern that EAX held before.  */
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))	/* ECX is below 128 here.  */
+
+	ALIGN (4)
+L(128bytesormore):
+	PUSH (%edi)		/* EDI is used as scratch for the threshold below.  */
+#ifdef DATA_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$DATA_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)	/* No push here: ENTRANCE already saved EBX when SHARED.  */
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_data_cache_size, %ebx
+# endif
+#endif
+	mov	%ebx, %edi
+	shr	$4, %ebx
+	sub	%ebx, %edi		/* EDI = cache size - cache size / 16.  */
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+/*
+ * When the data size approaches the L1 cache size,
+ * fast string will prefetch and combine data efficiently.
+ */
+	cmp	%edi, %ecx
+	jae	L(128bytesormore_endof_L1)	/* At or above the threshold: use rep stosl.  */
+	subl	$128, %ecx		/* Bias ECX so each sub below borrows once fewer than 128 bytes would remain.  */
+L(128bytesormore_normal):
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx		/* lea keeps the flags from the sub for the jb.  */
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx		/* Second copy of the loop body (unrolled twice).  */
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jae	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	POP (%edi)
+	add	$128, %ecx		/* Undo the bias: ECX = bytes left, below 128.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	CFI_PUSH (%edi)		/* CFI only: the code below runs with EDI still pushed.  */
+	ALIGN (4)
+L(128bytesormore_endof_L1):
+	mov	%edx, %edi		/* EDI = store pointer for stosl.  */
+	mov	%ecx, %edx
+	shr	$2, %ecx		/* ECX = number of whole dwords.  */
+	and	$3, %edx		/* EDX = 0..3 tail bytes; also sets ZF for the jz below.  */
+	rep stosl
+	jz	L(copy_page_by_rep_exit)	/* ZF is still from the and: no tail bytes.  */
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)	/* Tail is one byte.  */
+	movw	%ax, (%edi)
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)	/* Tail was exactly two bytes.  */
+L(copy_page_by_rep_left_1):
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%edi)
+	SETRTNVAL
+	RETURN
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):	/* Lengths 0, 16, ..., 112: aligned 16-byte stores only.  */
+	movdqa	%xmm0, -112(%edx)	/* EDX = 16-byte-aligned base + ECX, so each offset lands on a 16-byte boundary.  */
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):	/* Lengths 1, 17, ..., 113: 16-byte stores, then one byte.  */
+	movdqa	%xmm0, -113(%edx)	/* Still aligned: EDX minus the full length is the aligned base.  */
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2_rep)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
new file mode 100644
index 0000000000..d7b8be9114
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -0,0 +1,860 @@
+/* memset with SSE2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.   */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2)	/* Checked entry point; only built when SHARED and not USE_AS_BZERO.  */
+	movl	12(%esp), %eax		/* EAX = length argument; EBX has not been pushed yet here.  */
+	cmpl	%eax, 16(%esp)		/* Compare the following argument (presumably the destination size) with it.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Unsigned: that argument is below the length.  */
+END (__memset_chk_sse2)	/* No ret: execution continues into __memset_sse2 below.  */
+#endif
+ENTRY (__memset_sse2)
+	ENTRANCE		/* Pushes EBX when SHARED, otherwise empty.  */
+
+	movl	LEN(%esp), %ecx		/* ECX = byte count.  */
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax		/* bzero: the fill pattern is zero.  */
+#else
+	movzbl	CHR(%esp), %eax		/* EAX = fill byte, zero-extended.  */
+	movb	%al, %ah		/* Low 16 bits now hold the byte twice.  */
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
+	movl	DEST(%esp), %edx	/* EDX = destination pointer.  */
+	cmp	$32, %ecx
+	jae	L(32bytesormore)	/* 32 bytes or more use the SSE2 code.  */
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))	/* ECX is 0..31 and indexes the table.  */
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX >= 32; the alignment of EDX is tested below.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	pshufd	$0, %xmm0, %xmm0	/* Copy the low dword into all four lanes.  */
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX >= 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)		/* Unaligned store covers the first 16 bytes.  */
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx		/* EDX = next 16-byte boundary.  */
+	sub	%edx, %eax		/* EAX = -(bytes skipped to reach that boundary).  */
+	add	%eax, %ecx		/* ECX = bytes left counting from the boundary.  */
+	movd	%xmm0, %eax		/* Reload the dword pattern that EAX held before.  */
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))	/* ECX is below 128 here.  */
+
+	ALIGN (4)
+L(128bytesormore):	/* Choose a store loop by comparing ECX with the shared and data cache sizes.  */
+#ifdef SHARED_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$SHARED_CACHE_SIZE, %ebx	/* Compile-time shared cache size.  */
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)	/* Presumably loads the PC into EBX for GOT-relative addressing -- TODO confirm.  */
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_shared_cache_size, %ebx	/* Shared cache size read from a variable.  */
+# endif
+#endif
+	cmp	%ebx, %ecx
+	jae	L(128bytesormore_nt_start)	/* ECX >= shared cache size: take the path that ends in non-temporal stores.  */
+
+
+#ifdef DATA_CACHE_SIZE	/* NOTE(review): the PUSH above is keyed on SHARED_CACHE_SIZE but this POP on DATA_CACHE_SIZE; looks like both must be defined together -- confirm.  */
+	POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+	cmp	$DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+#  define RESTORE_EBX_STATE
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+	POP (%ebx)
+#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+	cmp	__x86_data_cache_size, %ecx
+# endif
+#endif
+
+	jae	L(128bytes_L2_normal)	/* ECX >= data cache size: use the loop that prefetches.  */
+	subl	$128, %ecx	/* Bias ECX by -128 for the borrow-driven loop that follows.  */
+L(128bytesormore_normal):	/* Two 128-byte chunks per pass; ECX enters biased by -128.  */
+	sub	$128, %ecx	/* CF set => fewer than 128 bytes will remain after this chunk.  */
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx	/* lea advances EDX without touching the flags from the sub.  */
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx	/* Second chunk of the pass; same flag protocol.  */
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jae	L(128bytesormore_normal)	/* No borrow: at least one more full chunk remains.  */
+
+L(128bytesless_normal):
+	add	$128, %ecx	/* Undo the bias: ECX = bytes still to store (< 128).  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytes_L2_normal):	/* One 128-byte chunk per pass, prefetching ahead of the stores.  */
+	prefetcht0	0x380(%edx)	/* Prefetch 896 bytes ahead...  */
+	prefetcht0	0x3c0(%edx)	/* ...and 960 bytes ahead.  */
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
+	add	$128, %edx
+	cmp	$128, %ecx
+	jae	L(128bytes_L2_normal)	/* Repeat while at least 128 bytes remain.  */
+
+L(128bytesless_L2_normal):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))	/* ECX < 128 here.  */
+
+	RESTORE_EBX_STATE	/* CFI_PUSH (%ebx) or empty, per the #define selected earlier.  */
+L(128bytesormore_nt_start):
+	sub	%ebx, %ecx	/* EBX = shared cache size; ECX = byte count beyond it.  */
+	ALIGN (4)
+L(128bytesormore_shared_cache_loop):	/* Ordinary movdqa stores, counting EBX down in 128-byte steps.  */
+	prefetcht0	0x3c0(%edx)
+	prefetcht0	0x380(%edx)
+	sub	$0x80, %ebx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ebx
+	jae	L(128bytesormore_shared_cache_loop)
+	cmp	$0x80, %ecx
+	jb	L(shared_cache_loop_end)	/* Fewer than 128 bytes left in ECX: skip the non-temporal loop.  */
+	ALIGN (4)
+L(128bytesormore_nt):	/* Remaining full 128-byte chunks use non-temporal stores.  */
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm0, 0x10(%edx)
+	movntdq	%xmm0, 0x20(%edx)
+	movntdq	%xmm0, 0x30(%edx)
+	movntdq	%xmm0, 0x40(%edx)
+	movntdq	%xmm0, 0x50(%edx)
+	movntdq	%xmm0, 0x60(%edx)
+	movntdq	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ecx
+	jae	L(128bytesormore_nt)
+	sfence	/* Order the non-temporal stores ahead of any later stores.  */
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))	/* ECX < 128 here.  */
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
new file mode 100644
index 0000000000..f601663a9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
@@ -0,0 +1,75 @@
+/* Multiple versions of memset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memset)	/* IFUNC resolver; the LOAD_FUNC_GOT_EAX macros presumably leave the chosen address in EAX.  */
+	.type	memset, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_ia32)	/* Default choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* Taken when ZF is set, presumably meaning the feature bit is clear.  */
+	LOAD_FUNC_GOT_EAX (__memset_sse2)	/* SSE2 present.  */
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_sse2_rep)	/* SSE2 and Fast_Rep_String both present.  */
+2:	ret
+END(memset)
+
+# undef ENTRY	/* From here on ENTRY ignores NAME and always opens the global function __memset_ia32.  */
+# define ENTRY(name) \
+	.type __memset_ia32, @function; \
+	.globl __memset_ia32; \
+	.p2align 4; \
+	__memset_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Matching END: closes the CFI region and sets the size of __memset_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __memset_ia32, .-__memset_ia32
+
+# undef ENTRY_CHK	/* Same scheme for the checking entry point: always opens __memset_chk_ia32.  */
+# define ENTRY_CHK(name) \
+	.type __memset_chk_ia32, @function; \
+	.globl __memset_chk_ia32; \
+	.p2align 4; \
+	__memset_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK	/* Matching END_CHK: closes the CFI region and sets the size of __memset_chk_ia32.  */
+# define END_CHK(name) \
+	cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which the PLT used by
+   IFUNC needs.  So bind the internal name directly to the ia32 version.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_ia32
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)	/* Expands to nothing, presumably to drop aliases emitted by the file included below.  */
+#endif
+
+#include "../memset.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
new file mode 100644
index 0000000000..573cf4208a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -0,0 +1,82 @@
+/* Multiple versions of __memset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__memset_chk)	/* IFUNC resolver; the LOAD_FUNC_GOT_EAX macros presumably leave the chosen address in EAX.  */
+	.type	__memset_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_chk_ia32)	/* Default choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* Taken when ZF is set, presumably meaning the feature bit is clear.  */
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2)	/* SSE2 present.  */
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep)	/* SSE2 and Fast_Rep_String both present.  */
+2:	ret
+END(__memset_chk)
+
+# ifdef SHARED
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)	/* SHARED build only: second name for __memset_chk.  */
+	.section .gnu.warning.__memset_zero_constant_len_parameter	/* The string placed here is the warning text tied to that name.  */
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+	.text
+	.type __memset_chk_sse2, @function	/* Non-SHARED build: size check in front of __memset_sse2.  */
+	.p2align 4;
+__memset_chk_sse2:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Third stack argument, presumably the fill length (assumes CALL_MCOUNT leaves ESP alone).  */
+	cmpl	%eax, 16(%esp)	/* Compare the fourth stack argument, presumably the destination size, against it.  */
+	jb	__chk_fail	/* Fourth argument below third (unsigned): go to __chk_fail.  */
+	jmp	__memset_sse2	/* Otherwise tail-jump; the stack arguments are untouched.  */
+	cfi_endproc
+	.size __memset_chk_sse2, .-__memset_chk_sse2
+
+	.type __memset_chk_sse2_rep, @function	/* Non-SHARED build: size check in front of __memset_sse2_rep.  */
+	.p2align 4;
+__memset_chk_sse2_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Third stack argument, presumably the fill length (assumes CALL_MCOUNT leaves ESP alone).  */
+	cmpl	%eax, 16(%esp)	/* Compare the fourth stack argument, presumably the destination size, against it.  */
+	jb	__chk_fail	/* Fourth argument below third (unsigned): go to __chk_fail.  */
+	jmp	__memset_sse2_rep	/* Otherwise tail-jump; the stack arguments are untouched.  */
+	cfi_endproc
+	.size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
+
+	.type __memset_chk_ia32, @function	/* Non-SHARED build: size check in front of __memset_ia32.  */
+	.p2align 4;
+__memset_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Third stack argument, presumably the fill length (assumes CALL_MCOUNT leaves ESP alone).  */
+	cmpl	%eax, 16(%esp)	/* Compare the fourth stack argument, presumably the destination size, against it.  */
+	jb	__chk_fail	/* Fourth argument below third (unsigned): go to __chk_fail.  */
+	jmp	__memset_ia32	/* Otherwise tail-jump; the stack arguments are untouched.  */
+	cfi_endproc
+	.size __memset_chk_ia32, .-__memset_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 0000000000..88c0e5776c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR	/* Configure the shared source below for its rawmemchr variant.  */
+#define MEMCHR __rawmemchr_sse2_bsf	/* Symbol name handed to the included source.  */
+#include "memchr-sse2-bsf.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 0000000000..038c74896b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR	/* Configure the shared source below for its rawmemchr variant.  */
+#define MEMCHR __rawmemchr_sse2	/* Symbol name handed to the included source.  */
+#include "memchr-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 0000000000..0a41d63ee8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of rawmemchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__rawmemchr)	/* IFUNC resolver; the LOAD_FUNC_GOT_EAX macros presumably leave the chosen address in EAX.  */
+	.type	__rawmemchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* Taken when ZF is set, presumably meaning no SSE2: use the ia32 version.  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f	/* Presumably Slow_BSF clear: use the version that relies on BSF.  */
+
+	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2)	/* SSE2 with Slow_BSF.  */
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__rawmemchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf)
+	ret
+END(__rawmemchr)
+
+weak_alias(__rawmemchr, rawmemchr)
+
+# undef ENTRY	/* From here on ENTRY ignores NAME and always opens the global function __rawmemchr_ia32.  */
+# define ENTRY(name) \
+	.type __rawmemchr_ia32, @function; \
+	.globl __rawmemchr_ia32; \
+	.p2align 4; \
+	__rawmemchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Matching END: closes the CFI region and sets the size of __rawmemchr_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which the PLT used by
+   IFUNC needs.  So bind the internal name directly to the ia32 version.  */
+# define libc_hidden_def(name) \
+	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
new file mode 100644
index 0000000000..1aa5440644
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
@@ -0,0 +1 @@
+#include <string/strnlen.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
new file mode 100644
index 0000000000..2e9619f97c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fma.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+double
+__fma_fma (double x, double y, double z)
+{ /* Returns x * y + z computed by one scalar FMA instruction.  */
+  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));  /* %0 = %2 * %0 + %3; "0" ties the input x to the output register.  */
+  return x;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
new file mode 100644
index 0000000000..411ebb2ba9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fma.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma,	/* Selector for __fma: __fma_fma when FMA_Usable is set, otherwise __fma_ia32.  */
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+#define __fma __fma_ia32	/* The generic source included next presumably defines __fma; this renames it to the fallback.  */
+
+#include <sysdeps/ieee754/ldbl-96/s_fma.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..ee57abfda2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fmaf.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+float
+__fmaf_fma (float x, float y, float z)
+{ /* Returns x * y + z computed by one scalar FMA instruction.  */
+  asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));  /* %0 = %2 * %0 + %3; "0" ties the input x to the output register.  */
+  return x;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..00b0fbcfc5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fmaf.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf,	/* Selector for __fmaf: __fmaf_fma when FMA_Usable is set, otherwise __fmaf_ia32.  */
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+#define __fmaf __fmaf_ia32	/* The generic source included next presumably defines __fmaf; this renames it to the fallback.  */
+
+#include <sysdeps/ieee754/dbl-64/s_fmaf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
new file mode 100644
index 0000000000..7db31b02f8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
new file mode 100644
index 0000000000..46ca1b3074
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY	/* Presumably read by strcpy-sse2.S to enable its stpcpy variant; confirm there.  */
+#define STRCPY __stpcpy_sse2	/* Presumably the entry-point name used by strcpy-sse2.S; confirm there.  */
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY	/* Presumably read by strcpy-ssse3.S to enable its stpcpy variant; confirm there.  */
+#define STRCPY __stpcpy_ssse3	/* Presumably the entry-point name used by strcpy-ssse3.S; confirm there.  */
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
new file mode 100644
index 0000000000..ee81ab6ae3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
@@ -0,0 +1,9 @@
+/* Multiple versions of stpcpy.
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STPCPY	/* Presumably read by strcpy.S to enable its stpcpy variant; confirm there.  */
+#define STRCPY __stpcpy	/* Presumably the entry-point name used by strcpy.S; confirm there.  */
+#include "strcpy.S"
+/* The public name stpcpy is a weak alias of __stpcpy.  */
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
new file mode 100644
index 0000000000..37a703cb76
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY	/* Both USE_AS_* switches are set; presumably strcpy-sse2.S then builds stpncpy.  */
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2	/* Presumably the entry-point name used by strcpy-sse2.S; confirm there.  */
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY	/* Both USE_AS_* switches are set; presumably strcpy-ssse3.S then builds stpncpy.  */
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3	/* Presumably the entry-point name used by strcpy-ssse3.S; confirm there.  */
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
new file mode 100644
index 0000000000..2698ca6a8c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
@@ -0,0 +1,8 @@
+/* Multiple versions of stpncpy.
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCPY __stpncpy	/* Presumably the entry-point name used by strcpy.S; confirm there.  */
+#define USE_AS_STPCPY	/* Both USE_AS_* switches are set; presumably strcpy.S then builds stpncpy.  */
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+/* The public name stpncpy is a weak alias of __stpncpy.  */
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
new file mode 100644
index 0000000000..753c6ec84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
@@ -0,0 +1,12 @@
+#include <string.h>
+
+extern __typeof (strcasecmp) __strcasecmp_nonascii;
+/* Rename __strcasecmp so any definition in the file included below is emitted as __strcasecmp_nonascii.  */
+#define __strcasecmp __strcasecmp_nonascii
+#include <string/strcasecmp.c>
+/* __strcasecmp_ia32 is a second name for the same code.  */
+strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32)
+
+/* The needs of strcasecmp in libc are minimal, so there is no need to
+   go through the IFUNC.  */
+strong_alias (__strcasecmp_nonascii, __GI___strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
new file mode 100644
index 0000000000..ec59276408
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strcasecmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+	.text
+ENTRY(__strcasecmp)
+	.type	__strcasecmp, @gnu_indirect_function	/* IFUNC symbol: this code picks the implementation.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ia32)	/* Start with the ia32 version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Presumably taken when SSSE3 is not reported: keep ia32.  */
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f	/* Presumably taken when SSE4_2 is not reported: keep ssse3.  */
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f	/* Presumably taken when Slow_SSE4_2 is set: keep ssse3.  */
+	LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2)
+2:	ret
+END(__strcasecmp)
+
+weak_alias (__strcasecmp, strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
new file mode 100644
index 0000000000..d4fcd2b4a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii;
+/* Rename __strcasecmp_l so any definition in the file included below is emitted as __strcasecmp_l_nonascii.  */
+#define __strcasecmp_l __strcasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL    1
+#include <string/strcasecmp.c>
+/* __strcasecmp_l_ia32 is a second name for the same code.  */
+strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32)
+
+/* The needs of strcasecmp_l in libc are minimal, so there is no need to
+   go through the IFUNC.  */
+strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
new file mode 100644
index 0000000000..411d4153f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1	/* Presumably read by strcmp-sse4.S to build its strcasecmp_l variant; confirm there.  */
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
new file mode 100644
index 0000000000..a22b93c518
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1	/* Presumably read by strcmp-ssse3.S to build its strcasecmp_l variant; confirm there.  */
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
new file mode 100644
index 0000000000..711c09b0dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strcasecmp_l.
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strcasecmp_l	/* Presumably the entry-point name used by strcmp.S; confirm there.  */
+#define USE_AS_STRCASECMP_L	/* Presumably read by strcmp.S to build its strcasecmp_l variant; confirm there.  */
+#include "strcmp.S"
+/* The public name strcasecmp_l is a weak alias of __strcasecmp_l.  */
+weak_alias (__strcasecmp_l, strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
new file mode 100644
index 0000000000..6359c7330c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
@@ -0,0 +1,1245 @@
+/* strcat with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)	/* Unwind annotations for a 4-byte push of REG.  */	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	/* Unwind annotations for a 4-byte pop of REG.  */	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push REG and annotate the push.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop REG and annotate the pop.  */
+
+# ifdef SHARED
+#  define JMPTBL(I, B) I - B	/* Table entries are offsets relative to B.  */
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register that contains
+	the index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	/* We first load PC into ECX.  */	\
+	SETUP_PIC_REG(cx);	\
+	/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ecx;	\
+	/* Get the entry and convert the relative offset to the	\
+	absolute address.  */	\
+	addl	(%ecx,INDEX,SCALE), %ecx;	\
+	/* We loaded the jump table and adjusted ECX. Go.  */	\
+	jmp	*%ecx
+# else
+#  define JMPTBL(I, B) I	/* Table entries are absolute addresses.  */
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute offsets.  INDEX is a register that contains the index into
+	the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_sse2
+# endif
+
+# define PARMS  4	/* Presumably the 4-byte return address; confirm.  */
+# define STR1  PARMS+4	/* Offset of the first argument once one register is pushed.  */
+# define STR2  STR1+4	/* Offset of the second argument once one register is pushed.  */
+
+# ifdef USE_AS_STRNCAT
+#  define LEN    STR2+8	/* Offset of the third argument once two registers are pushed.  */
+#  define STR3   STR1+4	/* Offset of the first argument once two registers are pushed.  */
+# else
+#  define STR3   STR1
+# endif
+
+# define USE_AS_STRCAT
+# ifdef USE_AS_STRNCAT
+#  define RETURN  POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi);	/* The CFI_PUSHes after ret re-apply the unwind state for the code that follows.  */
+# else
+#  define RETURN  POP(%esi); ret; CFI_PUSH(%esi);	/* The CFI_PUSH after ret re-applies the unwind state for the code that follows.  */
+# endif
+
+.text
+ENTRY (STRCAT)
+	PUSH	(%esi)
+	mov	STR1(%esp), %eax	/* EAX = first stack argument (destination).  */
+	mov	STR2(%esp), %esi	/* ESI = second stack argument (source).  */
+# ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	movl	LEN(%esp), %ebx	/* EBX = third stack argument (length limit).  */
+	test	%ebx, %ebx
+	jz	L(ExitZero)	/* Limit of zero: nothing to append.  */
+# endif
+	cmpb	$0, (%esi)
+	mov	%esi, %ecx
+	mov	%eax, %edx
+	jz	L(ExitZero)	/* Source starts with NUL (flags are from cmpb; mov leaves them alone).  */
+
+	and	$63, %ecx	/* ECX = source offset within a 64-byte block.  */
+	and	$63, %edx	/* EDX = destination offset within a 64-byte block.  */
+	cmp	$32, %ecx
+	ja	L(StrlenCore7_1)	/* A 32-byte source read would cross a 64-byte boundary.  */
+	cmp	$48, %edx
+	ja	L(alignment_prolog)	/* A 16-byte destination read would cross a 64-byte boundary.  */
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm4, %xmm4
+	pxor	%xmm7, %xmm7
+	movdqu	(%eax), %xmm1
+	movdqu	(%esi), %xmm5	/* XMM5 = source bytes 0-15.  */
+	pcmpeqb	%xmm1, %xmm0	/* Flag NUL bytes in the first 16 destination bytes.  */
+	movdqu	16(%esi), %xmm6	/* XMM6 = source bytes 16-31.  */
+	pmovmskb %xmm0, %ecx
+	pcmpeqb	%xmm5, %xmm4	/* XMM4 flags NUL bytes in source bytes 0-15.  */
+	pcmpeqb	%xmm6, %xmm7	/* XMM7 flags NUL bytes in source bytes 16-31.  */
+	test	%ecx, %ecx
+	jnz	L(exit_less16_)	/* Destination ends within its first 16 bytes.  */
+	mov	%eax, %ecx	/* ECX keeps the unaligned destination address.  */
+	and	$-16, %eax	/* Round EAX down to a 16-byte boundary for the scan loop.  */
+	jmp	L(loop_prolog)
+
+L(alignment_prolog):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm4, %xmm4
+	mov	%edx, %ecx
+	pxor	%xmm7, %xmm7
+	and	$15, %ecx	/* ECX = destination offset within its 16-byte block.  */
+	and	$-16, %eax	/* Round EAX down to a 16-byte boundary.  */
+	pcmpeqb	(%eax), %xmm0	/* Flag NUL bytes in the aligned 16-byte block.  */
+	movdqu	(%esi), %xmm5	/* XMM5 = source bytes 0-15.  */
+	movdqu	16(%esi), %xmm6	/* XMM6 = source bytes 16-31.  */
+	pmovmskb %xmm0, %edx
+	pcmpeqb	%xmm5, %xmm4	/* XMM4 flags NUL bytes in source bytes 0-15.  */
+	shr	%cl, %edx	/* Drop mask bits for bytes before the destination start.  */
+	pcmpeqb	%xmm6, %xmm7	/* XMM7 flags NUL bytes in source bytes 16-31.  */
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	add	%eax, %ecx	/* ECX = original (unaligned) destination address.  */
+
+	pxor	%xmm0, %xmm0
+L(loop_prolog):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):	/* Scan the destination for its NUL, 64 bytes per iteration.  */
+	pcmpeqb	16(%eax), %xmm0	/* With no NUL found the register stays all-zero, so it is reused next pass.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax	/* Advance EAX by 64 bytes.  */
+	test	%edx, %edx
+	jz	L(align16_loop)
+	bsf	%edx, %edx	/* EDX = index of the first NUL in the block just tested.  */
+	add	%edx, %eax	/* EAX = address of the destination's NUL.  */
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit16):
+	bsf	%edx, %edx
+	lea	16(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit32):
+	bsf	%edx, %edx
+	lea	32(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit48):
+	bsf	%edx, %edx
+	lea	48(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_less16):
+	bsf	%edx, %edx
+	add	%ecx, %eax
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_less16_):
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+
+	.p2align 4
+L(StartStrcpyPart):
+	pmovmskb %xmm4, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	movdqu	%xmm5, (%eax)
+	pmovmskb %xmm7, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	mov	%esi, %ecx
+	and	$-16, %esi
+	and	$15, %ecx
+	pxor	%xmm0, %xmm0
+# ifdef USE_AS_STRNCAT
+	add	%ecx, %ebx
+	sbb	%edx, %edx
+	or	%edx, %ebx
+# endif
+	sub	%ecx, %eax
+	jmp	L(Unalign16Both)
+
+L(StrlenCore7_1):
+	mov	%eax, %ecx
+	pxor	%xmm0, %xmm0
+	and	$15, %ecx
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	shr	%cl, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16_1)
+	add	%eax, %ecx
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+
+	.p2align 4
+L(align16_loop_1):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16_1)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32_1)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48_1)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop_1)
+	bsf	%edx, %edx
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit16_1):
+	bsf	%edx, %edx
+	lea	16(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit32_1):
+	bsf	%edx, %edx
+	lea	32(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit48_1):
+	bsf	%edx, %edx
+	lea	48(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit_less16_1):
+	bsf	%edx, %edx
+	add	%ecx, %eax
+	add	%edx, %eax
+
+	.p2align 4
+L(StartStrcpyPart_1):
+	mov	%esi, %ecx
+	and	$15, %ecx
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+# ifdef USE_AS_STRNCAT
+	cmp	$48, %ebx
+	ja      L(BigN)
+# endif
+	pcmpeqb	(%esi), %xmm1
+# ifdef USE_AS_STRNCAT
+	add	%ecx, %ebx
+# endif
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%eax)
+	sub	%ecx, %eax
+
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+L(Unalign16BothBigN):
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%eax, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%eax, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%eax, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%eax, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%eax, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %eax
+# ifdef USE_AS_STRNCAT
+	lea	128(%ebx, %edx), %ebx
+# endif
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+
+	.p2align 4
+L(Unaligned64Loop_start):
+	add	$64, %eax
+	add	$64, %esi
+	movdqu	%xmm4, -64(%eax)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%eax)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%eax)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%eax)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%eax)
+	movdqu	%xmm5, 16(%eax)
+	movdqu	%xmm6, 32(%eax)
+	add	$48, %esi
+	add	$48, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+	.p2align 4
+L(BigN):
+	pcmpeqb	(%esi), %xmm1
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%eax)
+	sub	%ecx, %eax
+	sub     $48, %ebx
+	add     %ecx, %ebx
+
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+	jmp	L(Unalign16BothBigN)
+# endif
+
+/*------------end of main part-------------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ecx, %eax
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %eax
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %esi
+	add	$16, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%eax)
+	movdqu	%xmm5, 16(%eax)
+	add	$32, %esi
+	add	$32, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %eax
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %eax
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %eax
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+# endif
+
+# ifdef USE_AS_STRNCAT
+	.p2align 4
+L(StrncatExit0):
+	movb	%bh, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+# endif
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit1):
+	movb	%bh, 1(%eax)
+# endif
+L(Exit1):
+# ifdef USE_AS_STRNCAT
+	movb	(%esi), %dh
+# endif
+	movb	%dh, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit2):
+	movb	%bh, 2(%eax)
+# endif
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit3):
+	movb	%bh, 3(%eax)
+# endif
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%eax)
+# ifdef USE_AS_STRNCAT
+	movb	2(%esi), %dh
+# endif
+	movb	%dh, 2(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit4):
+	movb	%bh, 4(%eax)
+# endif
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit5):
+	movb	%bh, 5(%eax)
+# endif
+L(Exit5):
+	movl	(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+	movb	4(%esi), %dh
+# endif
+	movb	%dh, 4(%eax)
+	movl	%ecx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit6):
+	movb	%bh, 6(%eax)
+# endif
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%eax)
+	movw	%dx, 4(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit7):
+	movb	%bh, 7(%eax)
+# endif
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%eax)
+	movl	%edx, 3(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit8):
+	movb	%bh, 8(%eax)
+# endif
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit9):
+	movb	%bh, 9(%eax)
+# endif
+L(Exit9):
+	movlpd	(%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+	movb	8(%esi), %dh
+# endif
+	movb	%dh, 8(%eax)
+	movlpd	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit10):
+	movb	%bh, 10(%eax)
+# endif
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%eax)
+	movw	%dx, 8(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit11):
+	movb	%bh, 11(%eax)
+# endif
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%eax)
+	movl	%edx, 7(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit12):
+	movb	%bh, 12(%eax)
+# endif
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%eax)
+	movl	%edx, 8(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit13):
+	movb	%bh, 13(%eax)
+# endif
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 5(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit14):
+	movb	%bh, 14(%eax)
+# endif
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 6(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit15):
+	movb	%bh, 15(%eax)
+# endif
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 7(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit16):
+	movb	%bh, 16(%eax)
+# endif
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit17):
+	movb	%bh, 17(%eax)
+# endif
+L(Exit17):
+	movdqu	(%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+	movb	16(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movb	%dh, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit18):
+	movb	%bh, 18(%eax)
+# endif
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%eax)
+	movw	%cx, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit19):
+	movb	%bh, 19(%eax)
+# endif
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit20):
+	movb	%bh, 20(%eax)
+# endif
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit21):
+	movb	%bh, 21(%eax)
+# endif
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+	movb	20(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 16(%eax)
+	movb	%dh, 20(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit22):
+	movb	%bh, 22(%eax)
+# endif
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm3, 14(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit23):
+	movb	%bh, 23(%eax)
+# endif
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm3, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit24):
+	movb	%bh, 24(%eax)
+# endif
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit25):
+	movb	%bh, 25(%eax)
+# endif
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+# ifdef USE_AS_STRNCAT
+	movb	24(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movb	%dh, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit26):
+	movb	%bh, 26(%eax)
+# endif
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movw	%cx, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit27):
+	movb	%bh, 27(%eax)
+# endif
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movl	%ecx, 23(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit28):
+	movb	%bh, 28(%eax)
+# endif
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movl	%ecx, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit29):
+	movb	%bh, 29(%eax)
+# endif
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 13(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit30):
+	movb	%bh, 30(%eax)
+# endif
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 14(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit31):
+	movb	%bh, 31(%eax)
+# endif
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit32):
+	movb	%bh, 32(%eax)
+# endif
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+# ifdef USE_AS_STRNCAT
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%eax)
+	xor	%bh, %bh
+	movb	%bh, 64(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%eax)
+	lea	16(%eax, %ecx), %eax
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+# endif
+	.p2align 4
+L(ExitZero):
+	RETURN
+
+END (STRCAT)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):	/* Entry i (0-based) branches to L(Exit<i+1>).  */
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCAT
+L(ExitStrncatTable):	/* Entry i (0-based) branches to L(StrncatExit<i>).  */
+	.int	JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000000..59ffbc60a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
@@ -0,0 +1,572 @@
+/* strcat with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push of REG: CFA offset +4, REG recorded at offset 0.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* Unwind info for the matching 4-byte pop.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* pushl plus its unwind annotation.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* popl plus its unwind annotation.  */
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_ssse3	/* Default symbol name when the includer did not choose one.  */
+# endif
+
+# define PARMS  4
+# define STR1  PARMS+4	/* 8(%esp) after the prologue's PUSH (%edi): skips saved %edi and the return address.  */
+# define STR2  STR1+4	/* Second argument, next to STR1.  */
+
+# ifdef USE_AS_STRNCAT
+#  define LEN STR2+8	/* Third argument; 8 rather than 4 because %ebx is pushed before LEN is read.  */
+# endif
+
+# define USE_AS_STRCAT	/* Presumably tested by the files included below -- confirm there.  */
+
+.text
+ENTRY (STRCAT)
+	PUSH	(%edi)
+	mov	STR1(%esp), %edi	/* %edi = dest; every exit path in this file returns it in %eax.  */
+	mov	%edi, %edx
+
+# define RETURN  jmp L(StartStrcpyPart)	/* Makes the strlen code included next continue into the copy phase (assumes it exits via RETURN).  */
+# include "strlen-sse2.S"
+
+L(StartStrcpyPart):
+	mov	STR2(%esp), %ecx	/* %ecx = src.  */
+	lea	(%edi, %eax), %edx	/* %edx = dest + %eax, the append position (assumes the strlen code left the length in %eax -- confirm).  */
+# ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	mov	LEN(%esp), %ebx	/* %ebx = n, the maximum number of bytes to append.  */
+	test	%ebx, %ebx
+	jz	L(StrncatExit0)	/* n == 0: return dest without copying.  */
+	cmp	$8, %ebx
+	jbe	L(StrncatExit8Bytes)	/* n <= 8: use the count-checked byte scan.  */
+# endif
+	cmpb	$0, (%ecx)	/* Probe the first 16 source bytes one at a time; L(ExitN) copies N bytes, terminator included.  */
+	jz	L(Exit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%ecx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jb	L(StrncatExit15Bytes)	/* 8 < n < 16: finish with the count-checked scan.  */
+# endif
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%ecx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%ecx)
+	jz	L(Exit16)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	je	L(StrncatExit16)	/* n == 16 and no NUL in 16 bytes: copy 16 bytes and terminate.  */
+
+#  define RETURN1	\
+	POP	(%ebx);	\
+	POP	(%edi);	\
+	ret;	\
+	CFI_PUSH	(%ebx);	\
+	CFI_PUSH	(%edi)	/* The CFI_PUSHes after ret re-declare both saves for the code that follows.  */
+#  define USE_AS_STRNCPY
+# else
+#  define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)	/* Same epilogue with only %edi saved.  */
+# endif
+# include "strcpy-ssse3.S"	/* Bulk copy code; presumably uses RETURN1 and the exit labels defined below -- confirm there.  */
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* Tail copy: %ax is tested as a mask where bit k set selects L(Exit<k+1>) (mask presumably built by the included strcpy code).  */
+	add	%esi, %edx
+	add	%esi, %ecx	/* Advance dest and src by the offset held in %esi.  */
+
+	POP	(%esi)	/* %esi is not pushed in this file; presumably saved by the included strcpy code.  */
+	test	%al, %al
+	jz	L(ExitHigh)	/* Nothing flagged in bytes 0-7: examine bytes 8-15.  */
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%ecx), %xmm0	/* %al != 0 with bits 0-6 clear, so bit 7 is set: copy 8 bytes.  */
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh):	/* Same dispatch on the high mask byte for bytes 8-15.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%ecx), %xmm0	/* None of bits 8-14 set: copy all 16 bytes.  */
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4	/* Exit stubs: L(ExitN) copies N bytes from (%ecx) to (%edx) and returns dest (%edi) in %eax.  */
+L(StrncatExit1):
+	movb	%bh, 1(%edx)	/* L(StrncatExitN) also stores %bh at dest[N] as the terminator; relies on %bh == 0 (callers visible in this file arrive with %ebx == N).  */
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit2):
+	movb	%bh, 2(%edx)
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit3):
+	movb	%bh, 3(%edx)
+L(Exit3):
+	movw	(%ecx), %ax	/* 2 bytes + 1 byte.  */
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit4):
+	movb	%bh, 4(%edx)
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit5):
+	movb	%bh, 5(%edx)
+L(Exit5):
+	movl	(%ecx), %eax	/* 4 bytes + 1 byte.  */
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit6):
+	movb	%bh, 6(%edx)
+L(Exit6):
+	movl	(%ecx), %eax	/* 4 bytes + 2 bytes.  */
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit7):
+	movb	%bh, 7(%edx)
+L(Exit7):
+	movl	(%ecx), %eax	/* Two overlapping 4-byte moves (offsets 0 and 3) cover 7 bytes.  */
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8):
+	movb	%bh, 8(%edx)
+L(Exit8):
+	movlpd	(%ecx), %xmm0	/* One 8-byte move through %xmm0.  */
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4	/* Exit stubs for 9-16 bytes: L(ExitN) copies N bytes from (%ecx) to (%edx) and returns dest (%edi) in %eax.  */
+L(StrncatExit9):
+	movb	%bh, 9(%edx)	/* L(StrncatExitN) also stores %bh at dest[N] as the terminator; relies on %bh == 0 (callers visible in this file arrive with %ebx == N).  */
+L(Exit9):
+	movlpd	(%ecx), %xmm0	/* 8 bytes + 1 byte.  */
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit10):
+	movb	%bh, 10(%edx)
+L(Exit10):
+	movlpd	(%ecx), %xmm0	/* 8 bytes + 2 bytes.  */
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit11):
+	movb	%bh, 11(%edx)
+L(Exit11):
+	movlpd	(%ecx), %xmm0	/* 8 bytes, then 4 bytes at offset 7 (overlapping by one).  */
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit12):
+	movb	%bh, 12(%edx)
+L(Exit12):
+	movlpd	(%ecx), %xmm0	/* 8 bytes + 4 bytes.  */
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit13):
+	movb	%bh, 13(%edx)
+L(Exit13):
+	movlpd	(%ecx), %xmm0	/* Two overlapping 8-byte moves (offsets 0 and 5).  */
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit14):
+	movb	%bh, 14(%edx)
+L(Exit14):
+	movlpd	(%ecx), %xmm0	/* Two overlapping 8-byte moves (offsets 0 and 6).  */
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15):
+	movb	%bh, 15(%edx)
+L(Exit15):
+	movlpd	(%ecx), %xmm0	/* Two overlapping 8-byte moves (offsets 0 and 7).  */
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit16):
+	movb	%bh, 16(%edx)
+L(Exit16):
+	movlpd	(%ecx), %xmm0	/* Two adjacent 8-byte moves.  */
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)	/* CFI only: the code below is entered with %esi still on the stack.  */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):	/* Tail copy with both a mask in %eax and a byte limit in %ebx: stop at whichever comes first.  */
+	add	$16, %ebx	/* Presumably undoes a bias of 16 applied by the included strcpy code -- confirm there.  */
+	add	%esi, %ecx
+	lea	(%esi, %edx), %esi	/* Park dest + offset in %esi while %edx serves as scratch.  */
+	lea	-9(%ebx), %edx
+	and	$1<<7, %dh	/* Keep bit 15 of %ebx - 9; it is set when that value is negative for the small limits checked below.  */
+	or	%al, %dh
+	test	%dh, %dh	/* Zero only if %al == 0 and that bit is clear.  */
+	lea	(%esi), %edx	/* Restore %edx = dest + offset; lea does not alter the flags.  */
+	POP	(%esi)
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al	/* For each byte: mask bit first (plain copy), then limit reached (copy + terminator).  */
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0	/* Fall-through: copy 8 bytes.  */
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)	/* CF = 1 exactly when dest[7] is zero.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: stays at dest[7] if it is already zero, else moves to dest[8].  */
+	xor	%cl, %cl
+	movb	%cl, (%eax)	/* Store the terminator there.  */
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase2):	/* Interleaved mask/limit checks for bytes 8-15, using the high mask byte %ah.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0	/* Fall-through: copy 16 bytes; no separate terminator store on this path.  */
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	CFI_PUSH(%esi)	/* CFI only: %esi is still on the stack for the code below.  */
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)	/* Mask non-zero: Case2; otherwise fall through to Case3.  */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):	/* Mask is zero: copy exactly %ebx bytes (value after the add below) and append the terminator.  */
+	add	$16, %ebx	/* Presumably undoes a bias of 16 applied by the included strcpy code -- confirm there.  */
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHighCase3)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0	/* Fall-through: copy 8 bytes and terminate (same stores as L(StrncatExit8)).  */
+	movlpd	%xmm0, (%edx)
+	movb	%bh, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase3):	/* Case3 with %ebx above 8: dispatch on the exact count.  */
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0	/* Fall-through: copy 16 bytes and terminate (same stores as L(StrncatExit16)).  */
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movb	%bh, 16(%edx)	/* Terminator from %bh, as in the L(StrncatExitN) stubs.  */
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit0):	/* n == 0: nothing is copied, return dest.  */
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15Bytes):	/* Reached from the entry scan with src bytes 0-8 non-zero and 8 < %ebx < 16.  */
+	cmp	$9, %ebx	/* Alternate: limit reached (copy + terminator), then next source byte zero (plain copy).  */
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	movlpd	(%ecx), %xmm0	/* Fall-through: copy 15 bytes with two overlapping 8-byte moves.  */
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)	/* CF = 1 exactly when dest[14] is zero.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: dest[14] if it is already zero, else dest[15].  */
+	movb	%bh, (%eax)	/* Store the terminator (%bh is zero since %ebx < 16).  */
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8Bytes):	/* Reached from the entry code with 0 < %ebx <= 8.  */
+	cmpb	$0, (%ecx)	/* Alternate: source byte zero (plain copy), then limit reached (copy + terminator).  */
+	jz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0	/* Fall-through: copy 8 bytes.  */
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)	/* CF = 1 exactly when dest[7] is zero.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: dest[7] if it is already zero, else dest[8].  */
+	movb	%bh, (%eax)	/* Store the terminator (%bh is zero since %ebx <= 8).  */
+	movl	%edi, %eax
+	RETURN1
+
+# endif
+END (STRCAT)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
new file mode 100644
index 0000000000..8412cb6f23
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
@@ -0,0 +1,92 @@
+/* Multiple versions of strcat
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+#  define STRCAT strcat	/* Public symbol defined by this file when built as strcat.  */
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3	__strncat_ssse3	/* Implementation names the resolver below chooses between (strncat build).  */
+# define STRCAT_SSE2		__strncat_sse2
+# define STRCAT_IA32		__strncat_ia32
+# define __GI_STRCAT		__GI_strncat
+#else
+# define STRCAT_SSSE3	__strcat_ssse3	/* Implementation names the resolver below chooses between (strcat build).  */
+# define STRCAT_SSE2		__strcat_sse2
+# define STRCAT_IA32		__strcat_ia32
+# define __GI_STRCAT		__GI_strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncat in the static library, since
+   strncat is needed before initialization has happened.  */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCAT)
+	.type	STRCAT, @gnu_indirect_function	/* STRCAT is an IFUNC: this code returns the chosen implementation's address in %eax.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCAT_IA32)	/* Default choice: the plain IA-32 version.  */
+	HAS_CPU_FEATURE (SSE2)	/* The jz/jnz uses below imply ZF is set when the tested feature is absent -- see init-arch.h.  */
+	jz	2f	/* No SSE2: keep the default.  */
+	LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast_Unaligned_Load set: stay with the SSE2 version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: stay with the SSE2 version.  */
+	LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
+2:	ret
+END(STRCAT)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCAT_IA32, @function; \
+	.align 16; \
+	.globl STRCAT_IA32; \
+	.hidden STRCAT_IA32; \
+	STRCAT_IA32: cfi_startproc; \
+	CALL_MCOUNT	/* ENTRY now ignores its argument and always opens STRCAT_IA32 (hidden, 16-byte aligned).  */
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32	/* Matching END for the ENTRY override above.  */
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32	/* Internal alias bound directly to the IA-32 version.  */
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32	/* NOTE(review): __GI___STRCAT is a single token, so STRCAT is not macro-expanded inside it -- confirm the literal name is intended.  */
+
+# endif
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../../strcat.S"	/* Generic implementation; with the overrides above it is presumably emitted as STRCAT_IA32 -- confirm it uses ENTRY/END.  */
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
new file mode 100644
index 0000000000..95fd7c084e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -0,0 +1,158 @@
+/* strchr with SSE2 with bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push of REG: CFA offset +4, REG recorded at offset 0.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* Unwind info for the matching 4-byte pop.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* pushl plus its unwind annotation.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* popl plus its unwind annotation.  */
+
+# define PARMS  8	/* Offset of the first argument once ENTRANCE has pushed %edi (saved %edi + return address).  */
+# define ENTRANCE PUSH(%edi)
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);	/* The CFI_PUSH after ret re-declares the save for the code that follows.  */
+
+# define STR1  PARMS	/* First argument: the string.  */
+# define STR2  STR1+4	/* Second argument: the character.  */
+
+	.text
+ENTRY (__strchr_sse2_bsf)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx	/* %ecx = string pointer.  */
+	movd	STR2(%esp), %xmm1	/* Low dword of %xmm1 = character argument.  */
+
+	pxor	%xmm2, %xmm2	/* %xmm2 = 0; comparing against it flags zero bytes.  */
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1	/* The low byte now fills the low dword.  */
+	/* ECX = offset of the string within its 16-byte block.  */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1	/* Broadcast the low dword to all four; the flags from the and stay intact.  */
+	je	L(loop)	/* Offset 0: already 16-byte aligned.  */
+
+/* Handle unaligned string.  */
+	and	$-16, %edi	/* Round %edi down to a 16-byte boundary for movdqa.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where the NUL terminator is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes (those before the string start).  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	bsf	%eax, %eax
+	/* Is there a NUL terminator?  */
+	test	%edx, %edx
+	je	L(unaligned_match)
+	bsf	%edx, %edx
+	cmpl	%edx, %eax
+	/* Return a null pointer if the NUL terminator comes first.  */
+	ja	L(return_null)
+L(unaligned_match):
+	add	%edi, %eax	/* Result = aligned base + offset + match index.  */
+	add	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(unaligned_no_match):	/* First block held no match at or after the string start.  */
+	test	%edx, %edx
+	jne	L(return_null)	/* ... but it did hold the terminator: not found.  */
+	pxor	%xmm2, %xmm2	/* Re-zero %xmm2: it may still flag zero bytes located before the string start.  */
+
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string; four 16-byte blocks per iteration.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2	/* %xmm2 is all-zero on entry and stays so whenever the jnz below is not taken.  */
+	add	$16, %edi	/* Post-increment; L(match) subtracts it again.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx	/* One branch covers both "match" and "terminator".  */
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	jmp	L(loop)
+
+L(matches):	/* The last block contained a match, a terminator, or both.  */
+	pmovmskb %xmm2, %edx	/* Recompute the terminator mask; %edx was OR-ed with the match mask in the loop.  */
+	test	%eax, %eax
+	jz	L(return_null)	/* Terminator only: not found.  */
+	bsf	%eax, %eax
+	/* There is a match.  First find where the NUL terminator is.  */
+	test	%edx, %edx
+	je	L(match)
+	bsf	%edx, %ecx
+	/* Check if the NUL terminator comes first.  */
+	cmpl	%ecx, %eax
+	ja	L(return_null)
+L(match):
+	sub	$16, %edi	/* Undo the loop's post-increment to get the block base.  */
+	add	%edi, %eax
+	RETURN
+
+/* Return a null pointer.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2_bsf)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
new file mode 100644
index 0000000000..1f9e875b04
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
@@ -0,0 +1,348 @@
+/* strchr SSE2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push of REG: CFA offset +4, REG recorded at offset 0.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* Unwind info for the matching 4-byte pop.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* pushl plus its unwind annotation.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* popl plus its unwind annotation.  */
+
+# define PARMS  8	/* Offset of the first argument once ENTRANCE has pushed %edi (saved %edi + return address).  */
+# define ENTRANCE PUSH(%edi)
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);	/* The CFI_PUSH after ret re-declares the save for the code that follows.  */
+
+# define STR1  PARMS	/* First argument: the string.  */
+# define STR2  STR1+4	/* Second argument: the character.  */
+
+	atom_text_section	/* Section-selection macro defined outside this file.  */
+ENTRY (__strchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx	/* %ecx = string pointer.  */
+	movd	STR2(%esp), %xmm1	/* Low dword of %xmm1 = character argument.  */
+
+	pxor	%xmm2, %xmm2	/* %xmm2 = 0; comparing against it flags zero bytes.  */
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1	/* The low byte now fills the low dword.  */
+	/* ECX = offset of the string within its 16-byte block.  */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1	/* Broadcast the low dword to all four; the flags from the and stay intact.  */
+	je	L(loop)	/* Offset 0: already 16-byte aligned.  */
+
+/* Handle unaligned string.  */
+	and	$-16, %edi	/* Round %edi down to a 16-byte boundary for movdqa.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where the NUL terminator is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes (those before the string start).  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	/* Is there a NUL terminator?  */
+	add	%ecx, %edi	/* Masks were shifted by the offset, so point %edi at the string start to line up with bit 0.  */
+	test	%edx, %edx
+	jz	L(match_case1)	/* Match only.  */
+	jmp	L(match_case2)	/* Match and terminator in the same block.  */
+
+	.p2align 4
+L(unaligned_no_match):	/* First block held no match at or after the string start.  */
+	test	%edx, %edx
+	jne	L(return_null)	/* ... but it did hold the terminator: not found.  */
+
+	pxor	%xmm2, %xmm2	/* Re-zero %xmm2: it may still flag zero bytes located before the string start.  */
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string; four 16-byte blocks per iteration.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2	/* %xmm2 is all-zero on entry and stays so whenever the loop continues.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)	/* %edi still points at this block when a match is found.  */
+	test	%edx, %edx
+	jnz	L(return_null)	/* Terminator without a match: not found.  */
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+	jmp	L(loop)
+
+L(matches):
+	/* There is a match.  First find where the NUL terminator is.  */
+	test	%edx, %edx
+	jz	L(match_case1)	/* No terminator in this block: just locate the match.  */
+
+	.p2align 4
+L(match_case2):	/* Match (%eax) and terminator (%edx) in the same block: walk the bits to see which comes first.  */
+	test	%al, %al
+	jz	L(match_higth_case2)	/* No match in bytes 0-7.  */
+
+	mov	%al, %cl
+	and	$15, %cl
+	jnz	L(match_case2_4)	/* A match lies in bytes 0-3.  */
+
+	mov	%dl, %ch
+	and	$15, %ch
+	jnz	L(return_null)	/* Terminator in bytes 0-3, match later: not found.  */
+
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x10, %dl
+	jnz	L(return_null)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x20, %dl
+	jnz	L(return_null)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x40, %dl
+	jnz	L(return_null)
+	lea	7(%edi), %eax	/* %al != 0 with bits 0-6 clear: the match is byte 7.  */
+	RETURN
+
+	.p2align 4
+L(match_case2_4):	/* Low nibble of %al is non-zero: resolve bytes 0-3.  */
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x01, %dl
+	jnz	L(return_null)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x02, %dl
+	jnz	L(return_null)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x04, %dl
+	jnz	L(return_null)
+	lea	3(%edi), %eax	/* Bits 0-2 clear, so the match is byte 3.  */
+	RETURN
+
+	.p2align 4
+L(match_higth_case2):	/* Match is in bytes 8-15 (%al == 0) and a terminator exists somewhere in the block.  */
+	test	%dl, %dl
+	jnz	L(return_null)	/* Terminator in bytes 0-7 precedes the match.  */
+
+	mov	%ah, %cl
+	and	$15, %cl
+	jnz	L(match_case2_12)	/* A match lies in bytes 8-11.  */
+
+	mov	%dh, %ch
+	and	$15, %ch
+	jnz	L(return_null)	/* Terminator in bytes 8-11, match later: not found.  */
+
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x10, %dh
+	jnz	L(return_null)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x20, %dh
+	jnz	L(return_null)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x40, %dh
+	jnz	L(return_null)
+	lea	15(%edi), %eax	/* Falls through to here only for byte 15.  */
+	RETURN
+
+	.p2align 4
+L(match_case2_12):	/* Low nibble of %ah is non-zero: resolve bytes 8-11.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x01, %dh
+	jnz	L(return_null)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x02, %dh
+	jnz	L(return_null)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x04, %dh
+	jnz	L(return_null)
+	lea	11(%edi), %eax	/* Bits 8-10 clear, so the match is byte 11.  */
+	RETURN
+
+	.p2align 4
+L(match_case1):	/* Match with no terminator in the block: return the lowest set bit of %ax.  */
+	test	%al, %al
+	jz	L(match_higth_case1)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	lea	7(%edi), %eax	/* Only bit 7 left.  */
+	RETURN
+
+	.p2align 4
+L(match_higth_case1):	/* Same for bytes 8-15 via %ah.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	lea	15(%edi), %eax	/* Falls through to here for byte 15.  */
+	RETURN
+
+	.p2align 4	/* Return stubs: L(ExitN) returns %edi + (N - 1); bytes 7 and 15 are handled inline by the callers, so there is no Exit8/Exit16.  */
+L(Exit1):
+	lea	(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	14(%edi), %eax
+	RETURN
+
+/* Return a null pointer: the character was not found.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
new file mode 100644
index 0000000000..5b97b1c767
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strchr)
+	.type	strchr, @gnu_indirect_function	/* strchr is an IFUNC: this code returns the chosen implementation's address in %eax.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strchr_ia32)	/* Default choice: the plain IA-32 version.  */
+	HAS_CPU_FEATURE (SSE2)	/* The jz uses below imply ZF is set when the tested feature is absent -- see init-arch.h.  */
+	jz	2f	/* No SSE2: keep the default.  */
+	LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f	/* Slow_BSF not set: use the bsf-based SSE2 version.  */
+	LOAD_FUNC_GOT_EAX (__strchr_sse2)	/* Slow_BSF set: use the SSE2 version that avoids bsf.  */
+2:	ret
+END(strchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strchr_ia32, @function; \
+	.globl __strchr_ia32; \
+	.p2align 4; \
+	__strchr_ia32: cfi_startproc; \
+	CALL_MCOUNT	/* ENTRY now ignores its argument and always opens __strchr_ia32.  */
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strchr_ia32, .-__strchr_ia32	/* Matching END for the ENTRY override above.  */
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library, since
+   they will be called without setting up EBX, which is needed for the PLT
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strchr; __GI_strchr = __strchr_ia32	/* Internal alias bound directly to the IA-32 version.  */
+#endif
+
+#include "../../i586/strchr.S"	/* Generic implementation; with the overrides above it is presumably emitted as __strchr_ia32 -- confirm it uses ENTRY/END.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..cd26058671
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,804 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push: REG is saved at the new stack top.  */
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)	/* Unwind info for a 4-byte pop: REG is no longer saved on the stack.  */
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* pushl together with its unwind annotation.  */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* popl together with its unwind annotation.  */
+
+#ifdef USE_AS_STRNCMP	/* Variant: strncmp.  */
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1		8	/* %esp offset of argument 1 once REM has been pushed.  */
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)	/* CFI_PUSH re-describes the stack for code after the ret.  */
+# define REM		%ebp	/* Remaining byte count.  */
+#elif defined USE_AS_STRCASECMP_L	/* Variant: strcasecmp_l.  */
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strcasecmp_l_sse4_2
+# endif
+# ifdef PIC
+#  define STR1		12	/* %ebx and %edi are pushed before the arguments are read.  */
+# else
+#  define STR1		8	/* Only %edi is pushed before the arguments are read.  */
+# endif
+# define STR2		STR1+4
+# define LOCALE		12	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); ret; .p2align 4; CFI_PUSH (%edi)
+# endif
+# define NONASCII	__strcasecmp_nonascii	/* Jump target when the locale's non-ASCII-case bit is set.  */
+#elif defined USE_AS_STRNCASECMP_L	/* Variant: strncasecmp_l.  */
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_sse4_2
+# endif
+# ifdef PIC
+#  define STR1		16	/* %ebx, REM and %edi are pushed before the arguments are read.  */
+# else
+#  define STR1		12	/* REM and %edi are pushed before the arguments are read.  */
+# endif
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define LOCALE		16	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (REM); POP (%ebx); ret; \
+			.p2align 4; \
+			CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); POP (REM); ret; \
+			.p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi)
+# endif
+# define REM		%ebp	/* Remaining byte count.  */
+# define NONASCII	__strncasecmp_nonascii	/* Jump target when the locale's non-ASCII-case bit is set.  */
+#else	/* Variant: plain strcmp.  */
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4	/* Nothing is pushed before the arguments are read.  */
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_sse4_2)	/* Entry without a locale argument: reads __libc_tsd_LOCALE from TLS.  */
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax	/* TLS offset of __libc_tsd_LOCALE, read from the GOT.  */
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax	/* Add the thread pointer stored at %gs:0.  */
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* Fetch the LC_CTYPE entry of the locale.  */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)	/* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value.  */
+# ifdef PIC
+	je	L(ascii)	/* Bit clear: continue at L(ascii) with %ebx still pushed.  */
+	POP	(%ebx)	/* Undo the push before leaving for __strcasecmp_nonascii.  */
+	jmp	__strcasecmp_nonascii
+# else
+	jne	__strcasecmp_nonascii	/* Bit set: leave for __strcasecmp_nonascii.  */
+	jmp	L(ascii)
+# endif
+END (__strcasecmp_sse4_2)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_sse4_2)	/* Entry without a locale argument: reads __libc_tsd_LOCALE from TLS.  */
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax	/* TLS offset of __libc_tsd_LOCALE, read from the GOT.  */
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax	/* Add the thread pointer stored at %gs:0.  */
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* Fetch the LC_CTYPE entry of the locale.  */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)	/* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value.  */
+# ifdef PIC
+	je	L(ascii)	/* Bit clear: continue at L(ascii) with %ebx still pushed.  */
+	POP	(%ebx)	/* Undo the push before leaving for __strncasecmp_nonascii.  */
+	jmp	__strncasecmp_nonascii
+# else
+	jne	__strncasecmp_nonascii	/* Bit set: leave for __strncasecmp_nonascii.  */
+	jmp	L(ascii)
+# endif
+END (__strncasecmp_sse4_2)
+#endif
+
+	ENTRY (STRCMP)	/* Main entry; the name depends on the USE_AS_* variant.  */
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax	/* Locale passed as the last argument.  */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII	/* Bit 0 set: hand over to the NONASCII routine.  */
+
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+# endif
+L(ascii):	/* Also the target of the entry points that read the locale from TLS.  */
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:	/* 0x40 ('A' - 1) in each of 16 bytes.  */
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:	/* 0x5b ('Z' + 1) in each of 16 bytes.  */
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:	/* 0x20 in each of 16 bytes; OR-ed in by TOLOWER.  */
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+# ifdef PIC
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)
+#endif
+	mov	STR1(%esp), %edx	/* %edx = first string.  */
+	mov	STR2(%esp), %eax	/* %eax = second string.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM
+	test	REM, REM
+	je	L(eq)	/* A zero count compares equal.  */
+#endif
+	mov	%dx, %cx
+	and	$0xfff, %cx	/* Offset of the first string within its 4 KiB page.  */
+	cmp	$0xff0, %cx
+	ja	L(first4bytes)	/* Offset above 0xff0: avoid the 16-byte load, go bytewise.  */
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx	/* Same page-offset check for the second string.  */
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm3;						      \
+	movdqa	UCHIGH_reg, %xmm4;					      \
+	movdqa	reg2, %xmm5;						      \
+	movdqa	UCHIGH_reg, %xmm6;					      \
+	pcmpgtb	UCLOW_reg, %xmm3;					      \
+	pcmpgtb	reg1, %xmm4;						      \
+	pcmpgtb	UCLOW_reg, %xmm5;					      \
+	pcmpgtb	reg2, %xmm6;						      \
+	pand	%xmm4, %xmm3;						      \
+	pand	%xmm6, %xmm5;						      \
+	pand	LCQWORD_reg, %xmm3;					      \
+	pand	LCQWORD_reg, %xmm5;					      \
+	por	%xmm3, reg1;						      \
+	por	%xmm5, reg2	/* Net effect: OR 0x20 into every byte strictly between 0x40 and 0x5b; clobbers %xmm3-%xmm6.  */
+
+	movdqu	(%eax), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	movd	%xmm2, %ecx	/* Low 4 case-folded bytes of the first string.  */
+	movd	%xmm1, %edi	/* Low 4 case-folded bytes of the second string.  */
+	movdqa	%xmm2, %xmm3	/* Keep folded copies for L(less16bytes).  */
+	movdqa	%xmm1, %xmm4
+	cmpl	%edi, %ecx
+#else
+# define TOLOWER(reg1, reg)	/* Expands to nothing in the case-sensitive variants.  */
+
+	movd	%xmm2, %ecx	/* Low 4 bytes of the first string.  */
+	cmp	(%eax), %ecx
+#endif
+	jne	L(less4bytes)	/* The first 4 bytes already differ.  */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	movdqu	(%eax), %xmm1
+#endif
+	pxor	%xmm2, %xmm1	/* All zero iff the two 16-byte chunks are identical.  */
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0	/* With %xmm0 zero, CF is set iff %xmm1 is all zero.  */
+	jnc	L(less16bytes)	/* Some byte differs within these 16.  */
+	pcmpeqb	%xmm0, %xmm2	/* 0xff in each position where the first string has a NUL.  */
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)	/* A NUL lies within these 16 bytes.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
+	jbe	L(eq)	/* Count used up by these 16 equal bytes.  */
+#endif
+	add	$16, %edx
+	add	$16, %eax
+L(first4bytes):	/* Compare bytes 0-7 one at a time.  */
+	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table.  */
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, (%edx)
+#endif
+	jne	L(neq)	/* L(neq) derives the sign from the flags of this compare.  */
+	cmpl	$0, %ecx
+	je	L(eq)	/* Equal bytes that are NUL: the strings match.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	je	L(eq)	/* Only one byte was left to compare.  */
+#endif
+
+	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 1(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 2(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 3(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 4(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 5(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 6(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 7(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM	/* Account for the 8 bytes just compared.  */
+	je	L(eq)
+#endif
+	add	$8, %eax	/* Step both pointers past the 8 equal bytes.  */
+	add	$8, %edx
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)	/* The case-insensitive variants pushed %edi at entry already.  */
+#endif
+	PUSH	(%esi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_remember_state	/* Restored in front of L(more16byteseq).  */
+#endif
+	mov	%edx, %edi	/* %edi = first string.  */
+	mov	%eax, %esi	/* %esi = second string.  */
+	xorl	%eax, %eax	/* Result stays 0 unless a difference is found.  */
+L(check_offset):
+	movl	%edi, %edx
+	movl	%esi, %ecx
+	andl	$0xfff, %edx
+	andl	$0xfff, %ecx
+	cmpl	%edx, %ecx
+	cmovl	%edx, %ecx	/* %ecx = larger of the two page offsets.  */
+	lea	-0xff0(%ecx), %edx	/* %edx = that offset minus 0xff0.  */
+	sub	%edx, %edi	/* Bias both pointers so (ptr,%edx) still addresses the current byte.  */
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)	/* Larger offset is above 0xff0: go bytewise.  */
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	pcmpistri	$0x1a, %xmm2, %xmm1	/* %ecx = index of the first mismatch; CF set on a mismatch, ZF set when %xmm2 holds a NUL.  */
+	jbe	L(end)	/* Mismatch or NUL in this chunk.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
+	jbe	L(more16byteseq)	/* Count used up: equal.  */
+#endif
+
+	add	$16, %edx
+	jle	L(loop)	/* Loop while %edx has not become positive.  */
+L(crosspage):	/* Bytewise compare at index %edx.  */
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax	/* Difference: first-string byte minus second-string byte.  */
+	jne	L(ret)
+	testl	%ecx, %ecx
+	je	L(ret)	/* Both bytes are NUL; %eax is 0.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$1, REM
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)	/* Continue bytewise while %edx <= 15.  */
+	add	%edx, %edi	/* Fold the index back into the pointers.  */
+	add	%edx, %esi
+	jmp	L(check_offset)
+
+	.p2align 4
+L(end):	/* Reached from L(loop) with the flags and %ecx of pcmpistri.  */
+	jnc	L(ret)	/* No mismatch, so the jump here was for a NUL; %eax is still 0.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	%ecx, REM
+	jbe	L(more16byteseq)	/* The mismatch index is not below the remaining count: equal.  */
+#endif
+	lea	(%ecx,%edx), %ecx	/* Index of the mismatching byte relative to the biased pointers.  */
+	movzbl	(%edi,%ecx), %eax
+	movzbl	(%esi,%ecx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax	/* Result: first-string byte minus second-string byte.  */
+L(ret):	/* Return %eax after undoing every push made so far.  */
+	POP	(%esi)
+	POP	(%edi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	ret
+
+	.p2align 4
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_restore_state	/* Back to the unwind state saved by cfi_remember_state.  */
+L(more16byteseq):	/* Equal up to the count; drop what RETURN will not pop.  */
+	POP	(%esi)
+# ifdef USE_AS_STRNCMP
+	POP	(%edi)	/* In the strncasecmp variant RETURN pops %edi itself.  */
+# endif
+#endif
+L(eq):
+	xorl	%eax, %eax	/* Return 0.  */
+	RETURN
+
+L(neq):	/* Entered with the flags of the compare that detected the difference.  */
+	mov	$1, %eax	/* mov leaves those flags untouched.  */
+	ja	L(neq_bigger)	/* First-string byte is above (unsigned): return 1.  */
+	neg	%eax	/* Otherwise return -1.  */
+L(neq_bigger):
+	RETURN
+
+L(less16bytes):	/* A difference or NUL lies in the first 16 bytes; %ecx holds bytes 0-3 of the first string.  */
+	add	$0xfefefeff, %ecx	/* Start of the 0xfefefeff word-at-a-time zero-byte test.  */
+	jnc	L(less4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movd	%xmm3, %edi	/* Case-folded bytes 0-3 saved at entry.  */
+	xor	%edi, %ecx
+#else
+	xor	(%edx), %ecx
+#endif
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(less4bytes)	/* Test flagged bytes 0-3: resolve them bytewise.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(eq)	/* No more than 4 bytes were to be compared.  */
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	psrldq	$4, %xmm3	/* Bring case-folded bytes 4-7 down to the low dword.  */
+	psrldq	$4, %xmm4
+	movd	%xmm3, %ecx
+	movd	%xmm4, %edi
+	cmp	%edi, %ecx
+	mov	%ecx, %edi	/* Keep a copy for the xor below; mov does not change the flags.  */
+#else
+	mov	4(%edx), %ecx
+	cmp	4(%eax), %ecx
+#endif
+	jne	L(more4bytes)	/* Bytes 4-7 differ.  */
+	add	$0xfefefeff, %ecx	/* Same zero-byte test, now on bytes 4-7.  */
+	jnc	L(more4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	xor	%edi, %ecx
+#else
+	xor	4(%edx), %ecx
+#endif
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(more4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM
+	jbe	L(eq)
+#endif
+
+	add	$8, %edx	/* Skip the 8 bytes handled above and fall into L(less4bytes).  */
+	add	$8, %eax
+L(less4bytes):	/* Compare bytes 0-3 one at a time, then fall into L(more4bytes).  */
+
+	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table.  */
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, (%edx)
+#endif
+	jne	L(neq)	/* L(neq) derives the sign from the flags of this compare.  */
+	cmpl	$0, %ecx
+	je	L(eq)	/* Equal bytes that are NUL: the strings match.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	je	L(eq)	/* Only one byte was left to compare.  */
+#endif
+	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 1(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	je	L(eq)
+#endif
+
+	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 2(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 3(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+
+L(more4bytes):	/* Compare bytes 4-7 one at a time.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	je	L(eq)	/* Exactly 4 bytes were to be compared.  */
+#endif
+	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table.  */
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 4(%edx)
+#endif
+	jne	L(neq)	/* L(neq) derives the sign from the flags of this compare.  */
+	cmpl	$0, %ecx
+	je	L(eq)	/* Equal bytes that are NUL: the strings match.  */
+
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 5(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 6(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 7(%edx)
+#endif
+	jne	L(neq)
+	jmp	L(eq)	/* Byte 7 matched as well: report equal.  */
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..b25cc3e068
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -0,0 +1,2810 @@
+/* strcmp with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push: REG is saved at the new stack top.  */
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)	/* Unwind info for a 4-byte pop: REG is no longer saved on the stack.  */
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* pushl together with its unwind annotation.  */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* popl together with its unwind annotation.  */
+
+#ifdef USE_AS_STRNCMP	/* Variant: strncmp.  */
+# ifndef STRCMP
+#  define STRCMP	__strncmp_ssse3
+# endif
+# define STR1		8	/* %esp offset of argument 1 once REM has been pushed.  */
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)	/* CFI_PUSH re-describes the stack for code after the ret.  */
+# define UPDATE_STRNCMP_COUNTER				\
+	/* REM -= 16 - %ecx, unless REM <= 16 - %ecx */	\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, REM;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, REM
+# define FLAGS		%ebx
+# define REM		%ebp	/* Remaining byte count.  */
+#elif defined USE_AS_STRCASECMP_L	/* Variant: strcasecmp_l.  */
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strcasecmp_l_ssse3
+# endif
+# ifdef PIC
+#  define STR1		8	/* %ebx is pushed before the arguments are read.  */
+# else
+#  define STR1		4	/* Nothing is pushed before the arguments are read.  */
+# endif
+# define STR2		STR1+4
+# define LOCALE		12	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx)
+# else
+#  define RETURN	ret; .p2align 4
+# endif
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS		(%esp)	/* Stack slot created by the pushl $0 at L(crosspage).  */
+# define NONASCII	__strcasecmp_nonascii	/* Jump target when the locale's non-ASCII-case bit is set.  */
+#elif defined USE_AS_STRNCASECMP_L	/* Variant: strncasecmp_l.  */
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_ssse3
+# endif
+# ifdef PIC
+#  define STR1		12	/* %ebx and REM are pushed before the arguments are read.  */
+# else
+#  define STR1		8	/* Only REM is pushed before the arguments are read.  */
+# endif
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define LOCALE		16	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (REM); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM)
+# else
+#  define RETURN	POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# endif
+# define UPDATE_STRNCMP_COUNTER				\
+	/* REM -= 16 - %ecx, unless REM <= 16 - %ecx */	\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, REM;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, REM
+# define FLAGS		(%esp)	/* Stack slot created by the pushl $0 at L(crosspage).  */
+# define REM		%ebp	/* Remaining byte count.  */
+# define NONASCII	__strncasecmp_nonascii	/* Jump target when the locale's non-ASCII-case bit is set.  */
+#else	/* Variant: plain strcmp.  */
+# ifndef STRCMP
+#  define STRCMP	__strcmp_ssse3
+# endif
+# define STR1		4	/* Nothing is pushed before the arguments are read.  */
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS		%ebx
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_ssse3)	/* Entry without a locale argument: reads __libc_tsd_LOCALE from TLS.  */
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax	/* TLS offset of __libc_tsd_LOCALE, read from the GOT.  */
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax	/* Add the thread pointer stored at %gs:0.  */
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* Fetch the LC_CTYPE entry of the locale.  */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)	/* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value.  */
+# ifdef PIC
+	je	L(ascii)	/* Bit clear: continue at L(ascii) with %ebx still pushed.  */
+	POP	(%ebx)	/* Undo the push before leaving for __strcasecmp_nonascii.  */
+	jmp	__strcasecmp_nonascii
+# else
+	jne	__strcasecmp_nonascii	/* Bit set: leave for __strcasecmp_nonascii.  */
+	jmp	L(ascii)
+# endif
+END (__strcasecmp_ssse3)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_ssse3)	/* Entry without a locale argument: reads __libc_tsd_LOCALE from TLS.  */
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax	/* TLS offset of __libc_tsd_LOCALE, read from the GOT.  */
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax	/* Add the thread pointer stored at %gs:0.  */
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* Fetch the LC_CTYPE entry of the locale.  */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)	/* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value.  */
+# ifdef PIC
+	je	L(ascii)	/* Bit clear: continue at L(ascii) with %ebx still pushed.  */
+	POP	(%ebx)	/* Undo the push before leaving for __strncasecmp_nonascii.  */
+	jmp	__strncasecmp_nonascii
+# else
+	jne	__strncasecmp_nonascii	/* Bit set: leave for __strncasecmp_nonascii.  */
+	jmp	L(ascii)
+# endif
+END (__strncasecmp_ssse3)
+#endif
+
+ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII
+
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+# ifdef PIC
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)
+#endif
+
+	movl	STR1(%esp), %edx
+	movl	STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM
+	cmp	$16, REM
+	jb	L(less16bytes_sncmp)
+#elif !defined USE_AS_STRCASECMP_L
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	add	$8, %edx
+	add	$8, %eax
+#endif
+	movl	%edx, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	pxor	%xmm0, %xmm0
+	movlpd	(%eax), %xmm1
+	movlpd	(%edx), %xmm2
+	movhpd	8(%eax), %xmm1
+	movhpd	8(%edx), %xmm2
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm5;					\
+	movdqa	reg2, %xmm7;					\
+	movdqa	UCHIGH_reg, %xmm6;				\
+	pcmpgtb	UCLOW_reg, %xmm5;				\
+	pcmpgtb	UCLOW_reg, %xmm7;				\
+	pcmpgtb	reg1, %xmm6;					\
+	pand	%xmm6, %xmm5;					\
+	movdqa	UCHIGH_reg, %xmm6;				\
+	pcmpgtb	reg2, %xmm6;					\
+	pand	%xmm6, %xmm7;					\
+	pand	LCQWORD_reg, %xmm5;				\
+	por	%xmm5, reg1;					\
+	pand	LCQWORD_reg, %xmm7;				\
+	por	%xmm7, reg2
+	TOLOWER (%xmm1, %xmm2)
+#else
+# define TOLOWER(reg1, reg2)
+#endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %ecx
+	sub	$0xffff, %ecx
+	jnz	L(less16bytes)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(eq)
+#endif
+	add	$16, %eax
+	add	$16, %edx
+
+L(crosspage):
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	PUSH	(FLAGS)
+#endif
+	PUSH	(%edi)
+	PUSH	(%esi)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	pushl	$0
+	cfi_adjust_cfa_offset (4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_remember_state
+#endif
+
+	movl	%edx, %edi
+	movl	%eax, %ecx
+	and	$0xf, %ecx
+	and	$0xf, %edi
+	xor	%ecx, %eax
+	xor	%edi, %edx
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	xor	FLAGS, FLAGS
+#endif
+	cmp	%edi, %ecx
+	je	L(ashr_0)
+	ja	L(bigger)
+	orl	$0x20, FLAGS
+	xchg	%edx, %eax
+	xchg	%ecx, %edi
+L(bigger):
+	lea	15(%edi), %edi
+	sub	%ecx, %edi
+	cmp	$8, %edi
+	jle	L(ashr_less_8)
+	cmp	$14, %edi
+	je	L(ashr_15)
+	cmp	$13, %edi
+	je	L(ashr_14)
+	cmp	$12, %edi
+	je	L(ashr_13)
+	cmp	$11, %edi
+	je	L(ashr_12)
+	cmp	$10, %edi
+	je	L(ashr_11)
+	cmp	$9, %edi
+	je	L(ashr_10)
+L(ashr_less_8):
+	je	L(ashr_9)
+	cmp	$7, %edi
+	je	L(ashr_8)
+	cmp	$6, %edi
+	je	L(ashr_7)
+	cmp	$5, %edi
+	je	L(ashr_6)
+	cmp	$4, %edi
+	je	L(ashr_5)
+	cmp	$3, %edi
+	je	L(ashr_4)
+	cmp	$2, %edi
+	je	L(ashr_3)
+	cmp	$1, %edi
+	je	L(ashr_2)
+	cmp	$0, %edi
+	je	L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0
+ *  ecx(offset of esi)  eax(offset of edi)  relative offset  corresponding case
+ *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+L(ashr_0):
+	mov	$0xffff, %esi
+	movdqa	(%eax), %xmm1
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movdqa	(%edx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm2, %xmm1
+#else
+	pcmpeqb	(%edx), %xmm1
+#endif
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	mov	%ecx, %edi
+	jne	L(less32bytes)
+	UPDATE_STRNCMP_COUNTER
+	movl	$0x10, FLAGS
+	mov	$0x10, %ecx
+	pxor	%xmm0, %xmm0
+	.p2align 4
+L(loop_ashr_0):
+	movdqa	(%eax, %ecx), %xmm1
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movdqa	(%edx, %ecx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+#else
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	(%edx, %ecx), %xmm1
+#endif
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	jmp	L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+L(ashr_1):
+	/* First 16 bytes.  %esi = 0xffff is the reference mask for "all 16
+	   positions match"; %xmm0 is zeroed so it can be used to locate
+	   NUL bytes.  */
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	/* %xmm0 = 0xff in every byte position where %xmm1 holds NUL.  */
+	pcmpeqb	%xmm1, %xmm0
+	/* Shift the (%edx) block up by 15 bytes: only its lowest byte
+	   survives, in the top lane.  */
+	pslldq	$15, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	/* Equal bytes become 0xff; subtracting the NUL mask clears the top
+	   bit wherever %xmm1 had a NUL, so %edi ends up with one mask bit
+	   per "equal and not NUL" position.  */
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	/* Drop the low %cl positions from both masks, then compare them.  */
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	/* lea leaves EFLAGS alone, so jnz still tests the sub above.  */
+	lea	-15(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	/* Main-loop setup: %xmm3 carries the previous (%edx) block, %ecx is
+	   the running byte offset, and the shift count 1 is OR-ed into
+	   FLAGS (its low bits are read back at L(exit)).  */
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$1, FLAGS
+	/* %edi = ((%edx + 1) & 0xfff) - 0x1000: a negative counter that the
+	   loop advances by 16 per block and that becomes positive when the
+	   next (%edx, %ecx) load starts past a 4096-byte boundary.  */
+	lea	1(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_1):
+	/* Leave the fast path once the boundary counter goes positive.  */
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	/* Keep an unshifted copy; it becomes %xmm3 for the next block.  */
+	movdqa	%xmm2, %xmm4
+
+	/* %xmm2 = bytes 1..16 of the 32-byte value formed with %xmm3 in the
+	   low half and %xmm2 in the high half.  */
+	palignr	$1, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	/* %esi = ("equal and not NUL" mask) - 0xffff, which is zero only
+	   when all 16 positions matched and none was flagged as NUL.  */
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	/* Bounded variants: take 16 off REM.  The jbe acts on the cmp
+	   because the lea in between does not modify EFLAGS.  */
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	/* Second, unrolled copy of the step above.  */
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_1)
+
+	.p2align 4
+L(nibble_ashr_1):
+	/* Boundary case: flag the bytes of the carried block %xmm3 that
+	   equal the matching byte of %xmm0 (NUL bytes when %xmm0 is zero,
+	   as the loop appears to maintain) and test positions 1..15; mask
+	   0xfffe skips byte 0.  */
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffe, %esi
+	jnz	L(ashr_1_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	/* Bounded variants also finish in the tail when REM <= 15.  */
+	cmp	$15, REM
+	jbe	L(ashr_1_exittail)
+#endif
+	/* Nothing pending: clear %xmm0, rewind the boundary counter by
+	   0x1000 and rejoin the main loop body.  */
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_1)
+
+	.p2align 4
+L(ashr_1_exittail):
+	/* Reload the current (%eax) block and shift both the mask in %xmm0
+	   and the carried block %xmm3 down by 1 byte before the final
+	   compare in L(aftertail).  */
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$1, %xmm0
+	psrldq	$1, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(14~15)            n -14            1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+L(ashr_2):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-14(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$2, FLAGS
+	lea	2(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_2):
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_2)
+
+	.p2align 4
+L(nibble_ashr_2):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffc, %esi
+	jnz	L(ashr_2_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$14, REM
+	jbe	L(ashr_2_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_2)
+
+	.p2align 4
+L(ashr_2_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$2, %xmm0
+	psrldq	$2, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(13~15)            n -13            2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+L(ashr_3):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-13(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$3, FLAGS
+	lea	3(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_3):
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_3)
+
+	.p2align 4
+L(nibble_ashr_3):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff8, %esi
+	jnz	L(ashr_3_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$13, REM
+	jbe	L(ashr_3_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_3)
+
+	.p2align 4
+L(ashr_3_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$3, %xmm0
+	psrldq	$3, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(12~15)            n -12            3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+L(ashr_4):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-12(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$4, FLAGS
+	lea	4(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_4):
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_4)
+
+	.p2align 4
+L(nibble_ashr_4):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff0, %esi
+	jnz	L(ashr_4_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$12, REM
+	jbe	L(ashr_4_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_4)
+
+	.p2align 4
+L(ashr_4_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$4, %xmm0
+	psrldq	$4, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(11~15)            n -11            4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+L(ashr_5):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-11(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$5, FLAGS
+	lea	5(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_5):
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_5)
+
+	.p2align 4
+L(nibble_ashr_5):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffe0, %esi
+	jnz	L(ashr_5_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$11, REM
+	jbe	L(ashr_5_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_5)
+
+	.p2align 4
+L(ashr_5_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$5, %xmm0
+	psrldq	$5, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(10~15)            n -10            5(15 +(n-10) - n)         ashr_6
+ */
+
+	.p2align 4
+L(ashr_6):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-10(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$6, FLAGS
+	lea	6(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_6):
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_6)
+
+	.p2align 4
+L(nibble_ashr_6):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffc0, %esi
+	jnz	L(ashr_6_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$10, REM
+	jbe	L(ashr_6_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_6)
+
+	.p2align 4
+L(ashr_6_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$6, %xmm0
+	psrldq	$6, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(9~15)            n - 9            6(15 +(n-9) - n)         ashr_7
+ */
+
+	.p2align 4
+L(ashr_7):
+	/* First 16 bytes: compare the (%eax) block against the (%edx) block
+	   shifted up by 9 bytes, ignoring the low %cl positions.  */
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	/* %xmm0 = 0xff wherever %xmm1 holds a NUL byte.  */
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$9, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	/* After psubb only "equal and not NUL" bytes keep their top bit, so
+	   %edi gets one mask bit per such position.  */
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	/* lea leaves EFLAGS alone, so jnz still tests the sub above.  */
+	lea	-9(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	/* Main-loop setup: %xmm3 carries the previous (%edx) block, %ecx is
+	   the running byte offset, and the shift count 7 is OR-ed into
+	   FLAGS.  */
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$7, FLAGS
+	/* Boundary counter ((%edx + 7) & 0xfff) - 0x1000.  The displacement
+	   equals the shift count, as in the other ashr_N sections.  */
+	lea	7(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_7):
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_7)
+
+	.p2align 4
+L(nibble_ashr_7):
+	/* Boundary case: flag the bytes of the carried block %xmm3 that
+	   equal the matching byte of %xmm0 (NUL bytes when %xmm0 is zero,
+	   as the loop appears to maintain) and test positions 7..15 via
+	   mask 0xff80.  */
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff80, %esi
+	jnz	L(ashr_7_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	/* Bounded variants also finish in the tail when REM <= 9.  */
+	cmp	$9, REM
+	jbe	L(ashr_7_exittail)
+#endif
+	/* Nothing pending: clear %xmm0 (a single pxor suffices), rewind the
+	   boundary counter by 0x1000 and rejoin the main loop body.  */
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_7)
+
+	.p2align 4
+L(ashr_7_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$7, %xmm0
+	psrldq	$7, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(8~15)            n - 8            7(15 +(n-8) - n)         ashr_8
+ */
+	.p2align 4
+L(ashr_8):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-8(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$8, FLAGS
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_8):
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_8)
+
+	.p2align 4
+L(nibble_ashr_8):
+	/* Boundary case: flag the bytes of the carried block %xmm3 that
+	   equal the matching byte of %xmm0 (NUL bytes when %xmm0 is zero,
+	   as the loop appears to maintain) and test positions 8..15 via
+	   mask 0xff00.  */
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff00, %esi
+	jnz	L(ashr_8_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	/* Bounded variants also finish in the tail when REM <= 8.  */
+	cmp	$8, REM
+	jbe	L(ashr_8_exittail)
+#endif
+	/* Nothing pending: clear %xmm0 (a single pxor suffices), rewind the
+	   boundary counter by 0x1000 and rejoin the main loop body.  */
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_8)
+
+	.p2align 4
+L(ashr_8_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$8, %xmm0
+	psrldq	$8, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(7~15)            n - 7            8(15 +(n-7) - n)         ashr_9
+ */
+	.p2align 4
+L(ashr_9):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-7(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$9, FLAGS
+	lea	9(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_9):
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_9)
+
+	.p2align 4
+L(nibble_ashr_9):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfe00, %esi
+	jnz	L(ashr_9_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(ashr_9_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_9)
+
+	.p2align 4
+L(ashr_9_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$9, %xmm0
+	psrldq	$9, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(6~15)            n - 6            9(15 +(n-6) - n)         ashr_10
+ */
+	.p2align 4
+L(ashr_10):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-6(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$10, FLAGS
+	lea	10(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_10):
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_10)
+
+	.p2align 4
+L(nibble_ashr_10):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfc00, %esi
+	jnz	L(ashr_10_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	jbe	L(ashr_10_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_10)
+
+	.p2align 4
+L(ashr_10_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$10, %xmm0
+	psrldq	$10, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(5~15)            n - 5            10(15 +(n-5) - n)         ashr_11
+ */
+	.p2align 4
+L(ashr_11):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-5(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$11, FLAGS
+	lea	11(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_11):
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_11)
+
+	.p2align 4
+L(nibble_ashr_11):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf800, %esi
+	jnz	L(ashr_11_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	jbe	L(ashr_11_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_11)
+
+	.p2align 4
+L(ashr_11_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$11, %xmm0
+	psrldq	$11, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(4~15)            n - 4            11(15 +(n-4) - n)         ashr_12
+ */
+	.p2align 4
+L(ashr_12):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-4(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$12, FLAGS
+	lea	12(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_12):
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_12)
+
+	.p2align 4
+L(nibble_ashr_12):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf000, %esi
+	jnz	L(ashr_12_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(ashr_12_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_12)
+
+	.p2align 4
+L(ashr_12_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$12, %xmm0
+	psrldq	$12, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(3~15)            n - 3            12(15 +(n-3) - n)         ashr_13
+ */
+	.p2align 4
+L(ashr_13):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-3(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$13, FLAGS
+	lea	13(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_13):
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_13)
+
+	.p2align 4
+L(nibble_ashr_13):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xe000, %esi
+	jnz	L(ashr_13_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	jbe	L(ashr_13_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_13)
+
+	.p2align 4
+L(ashr_13_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$13, %xmm0
+	psrldq	$13, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(2~15)            n - 2            13(15 +(n-2) - n)         ashr_14
+ */
+	.p2align 4
+L(ashr_14):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$2, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-2(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$14, FLAGS
+	lea	14(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_14):
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_14)
+
+	.p2align 4
+L(nibble_ashr_14):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xc000, %esi
+	jnz	L(ashr_14_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	jbe	L(ashr_14_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_14)
+
+	.p2align 4
+L(ashr_14_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$14, %xmm0
+	psrldq	$14, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(1~15)            n - 1            14(15 +(n-1) - n)         ashr_15
+ */
+
+	.p2align 4
+L(ashr_15):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$1, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-1(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$15, FLAGS
+	lea	15(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_15):
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_15)
+
+	.p2align 4
+L(nibble_ashr_15):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0x8000, %esi
+	jnz	L(ashr_15_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	jbe	L(ashr_15_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_15)
+
+	.p2align 4
+L(ashr_15_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$15, %xmm0
+	psrldq	$15, %xmm3
+	jmp	L(aftertail)
+
+	.p2align 4
+L(aftertail):
+	/* Final compare for the ashr_N exit tails: build the byte mask of
+	   positions where %xmm1 equals %xmm3 and the mask in %xmm0 does not
+	   cancel the top bit, then invert it so that set bits in the low
+	   16 bits of %esi mark the positions to examine.  */
+	TOLOWER (%xmm1, %xmm3)
+	pcmpeqb	%xmm3, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	not	%esi
+L(exit):
+	/* %edi = (FLAGS & 0x1f) + %ecx - 16.  The ashr_N sections OR their
+	   shift count into the low bits of FLAGS.  */
+	mov	FLAGS, %edi
+	and	$0x1f, %edi
+	lea	-16(%edi, %ecx), %edi
+L(less32bytes):
+	/* Advance both pointers to the 16-byte window that the mask in
+	   %esi refers to.  */
+	add	%edi, %edx
+	add	%ecx, %eax
+	/* Bit 0x20 of FLAGS selects whether %eax and %edx are exchanged
+	   before the result is computed; presumably it records an earlier
+	   operand swap -- confirm where the bit is set.  */
+	testl	$0x20, FLAGS
+	jz	L(ret2)
+	xchg	%eax, %edx
+
+	.p2align 4
+L(ret2):
+	/* Move the mask into %ecx, then unwind the stack.  Execution falls
+	   through into L(less16bytes) afterwards.  */
+	mov	%esi, %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	/* Case-insensitive builds discard one 4-byte stack slot here
+	   instead of popping FLAGS below.  */
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+#endif
+	POP	(%esi)
+	POP	(%edi)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	POP	(FLAGS)
+#endif
+L(less16bytes):
+	/* %cl holds the low 8 bits of the position mask.  When none is set
+	   control goes to L(2next_8_bytes), which is defined further on.  */
+	test	%cl, %cl
+	jz	L(2next_8_bytes)
+
+	/* Dispatch on the lowest set bit: bit N selects L(ByteN).  */
+	test	$0x01, %cl
+	jnz	L(Byte0)
+
+	test	$0x02, %cl
+	jnz	L(Byte1)
+
+	test	$0x04, %cl
+	jnz	L(Byte2)
+
+	test	$0x08, %cl
+	jnz	L(Byte3)
+
+	test	$0x10, %cl
+	jnz	L(Byte4)
+
+	test	$0x20, %cl
+	jnz	L(Byte5)
+
+	test	$0x40, %cl
+	jnz	L(Byte6)
+	/* Only bit 7 can be set at this point, so offset 7 decides.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	/* Bounded variants branch to L(eq) when REM <= 7.  */
+	cmp	$7, REM
+	jbe	L(eq)
+#endif
+
+	/* Load the byte at offset 7 from each pointer, zero-extended.  */
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	/* Case-insensitive builds replace both bytes by the 32-bit entry of
+	   _nl_C_LC_CTYPE_tolower at index (byte + 128).  */
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	/* %eax = (value loaded via %edx) - (value loaded via %eax).  */
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte0):
+	/* Reached from L(less16bytes) when bit 0 of the mask is set.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	/* Bounded variants branch to L(eq) when REM is 0.  */
+	cmp	$0, REM
+	jbe	L(eq)
+#endif
+	/* Load the byte at offset 0 from each pointer, zero-extended.  */
+	movzx	(%eax), %ecx
+	movzx	(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	/* Case-insensitive builds replace both bytes by the 32-bit entry of
+	   _nl_C_LC_CTYPE_tolower at index (byte + 128).  */
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	/* %eax = (value loaded via %edx) - (value loaded via %eax).  */
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte1):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	jbe	L(eq)
+#endif
+	movzx	1(%eax), %ecx
+	movzx	1(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte2):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	jbe	L(eq)
+#endif
+	movzx	2(%eax), %ecx
+	movzx	2(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte3):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	jbe	L(eq)
+#endif
+	movzx	3(%eax), %ecx
+	movzx	3(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte4):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(eq)
+#endif
+	movzx	4(%eax), %ecx
+	movzx	4(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte5):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	jbe	L(eq)
+#endif
+	movzx	5(%eax), %ecx
+	movzx	5(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte6):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	jbe	L(eq)
+#endif
+	movzx	6(%eax), %ecx
+	movzx	6(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(2next_8_bytes):
+	add	$8, %eax
+	add	$8, %edx
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$8, REM
+	lea	-8(REM), REM
+	jbe	L(eq)
+#endif
+
+	test	$0x01, %ch
+	jnz	L(Byte0)
+
+	test	$0x02, %ch
+	jnz	L(Byte1)
+
+	test	$0x04, %ch
+	jnz	L(Byte2)
+
+	test	$0x08, %ch
+	jnz	L(Byte3)
+
+	test	$0x10, %ch
+	jnz	L(Byte4)
+
+	test	$0x20, %ch
+	jnz	L(Byte5)
+
+	test	$0x40, %ch
+	jnz	L(Byte6)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(eq)
+#endif
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+#ifdef USE_AS_STRNCMP
+L(neq_sncmp):
+#endif
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	.p2align 4
+	cfi_restore_state
+L(more8byteseq):
+
+# ifdef USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+# endif
+	POP	(%esi)
+	POP	(%edi)
+# ifdef USE_AS_STRNCMP
+	POP	(FLAGS)
+# endif
+#endif
+
+#ifdef USE_AS_STRNCMP
+L(eq_sncmp):
+#endif
+L(eq):
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	xorl	%eax, %eax
+	ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	.p2align 4
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+	CFI_PUSH (%ebx)
+# endif
+	CFI_PUSH (REM)
+L(less16bytes_sncmp):
+# ifdef USE_AS_STRNCASECMP_L
+	PUSH	(%esi)
+# endif
+	test	REM, REM
+	jz	L(eq_sncmp)
+
+	movzbl	(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, (%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$1, REM
+	je	L(eq_sncmp)
+
+	movzbl	1(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 1(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$2, REM
+	je	L(eq_sncmp)
+
+	movzbl	2(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 2(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$3, REM
+	je	L(eq_sncmp)
+
+	movzbl	3(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 3(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$4, REM
+	je	L(eq_sncmp)
+
+	movzbl	4(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 4(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$5, REM
+	je	L(eq_sncmp)
+
+	movzbl	5(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 5(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$6, REM
+	je	L(eq_sncmp)
+
+	movzbl	6(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 6(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$7, REM
+	je	L(eq_sncmp)
+
+	movzbl	7(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 7(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+
+	cmp	$8, REM
+	je	L(eq_sncmp)
+
+	movzbl	8(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	8(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 8(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$9, REM
+	je	L(eq_sncmp)
+
+	movzbl	9(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	9(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 9(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$10, REM
+	je	L(eq_sncmp)
+
+	movzbl	10(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	10(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 10(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$11, REM
+	je	L(eq_sncmp)
+
+	movzbl	11(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	11(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 11(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+
+	cmp	$12, REM
+	je	L(eq_sncmp)
+
+	movzbl	12(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	12(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 12(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$13, REM
+	je	L(eq_sncmp)
+
+	movzbl	13(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	13(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 13(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$14, REM
+	je	L(eq_sncmp)
+
+	movzbl	14(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	14(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 14(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$15, REM
+	je	L(eq_sncmp)
+
+	movzbl	15(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	15(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 15(%edx)
+# endif
+	jne	L(neq_sncmp)
+
+# ifdef USE_AS_STRNCASECMP_L
+L(eq_sncmp):
+	POP	(%esi)
+# endif
+	POP	(REM)
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+	POP	(%ebx)
+# endif
+	xor	%eax, %eax
+	ret
+
+# ifdef USE_AS_STRNCASECMP_L
+	.p2align 4
+#  ifdef PIC
+	CFI_PUSH (%ebx)
+#  endif
+	CFI_PUSH (REM)
+	CFI_PUSH (%esi)
+L(neq_sncmp):
+	mov	$1, %eax
+	mov	$-1, %edx
+	cmovna	%edx, %eax
+	POP	(%esi)
+	POP	(REM)
+#  ifdef PIC
+	POP	(%ebx)
+#  endif
+	ret
+# endif
+#endif
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
new file mode 100644
index 0000000000..56de25a4b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
@@ -0,0 +1,95 @@
+/* Multiple versions of strcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRNCMP	/* Names used when this file is built as strncmp.  */
+# define STRCMP			strncmp
+# define __GI_STRCMP		__GI_strncmp
+# define __STRCMP_IA32		__strncmp_ia32
+# define __STRCMP_SSSE3		__strncmp_ssse3
+# define __STRCMP_SSE4_2	__strncmp_sse4_2
+#elif defined USE_AS_STRCASECMP_L	/* Names used for __strcasecmp_l.  */
+# define STRCMP			__strcasecmp_l
+# define __GI_STRCMP		__GI_strcasecmp_l
+# define __STRCMP_IA32		__strcasecmp_l_ia32
+# define __STRCMP_SSSE3		__strcasecmp_l_ssse3
+# define __STRCMP_SSE4_2	__strcasecmp_l_sse4_2
+#elif defined USE_AS_STRNCASECMP_L	/* Names used for __strncasecmp_l.  */
+# define STRCMP			__strncasecmp_l
+# define __GI_STRCMP		__GI_strncasecmp_l
+# define __STRCMP_IA32		__strncasecmp_l_ia32
+# define __STRCMP_SSSE3		__strncasecmp_l_ssse3
+# define __STRCMP_SSE4_2	__strncasecmp_l_sse4_2
+#else	/* No USE_AS_* selector defined: plain strcmp.  */
+# define STRCMP			strcmp
+# define __GI_STRCMP		__GI_strcmp
+# define __STRCMP_IA32		__strcmp_ia32
+# define __STRCMP_SSSE3		__strcmp_ssse3
+# define __STRCMP_SSE4_2	__strcmp_sse4_2
+#endif	/* STRCMP names the resolver symbol, __GI_STRCMP the alias bound to the IA32 version, __STRCMP_* the candidates.  */
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in the static library since we
+   need strncmp before the initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__STRCMP_IA32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2)
+2:	ret
+END(STRCMP)
+
+# undef ENTRY	/* From here on ENTRY ignores its argument and always opens the hidden, global symbol __STRCMP_IA32.  */
+# define ENTRY(name) \
+	.type __STRCMP_IA32, @function; \
+	.p2align 4; \
+	.globl __STRCMP_IA32; \
+	.hidden __STRCMP_IA32; \
+	__STRCMP_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Likewise END ignores its argument and always closes __STRCMP_IA32.  */
+# define END(name) \
+	cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which the PLT used by
+   IFUNC needs.  So bind __GI_STRCMP directly to the IA32 version.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32
+# endif
+#endif
+
+#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \
+    && !defined USE_AS_STRNCASECMP_L
+# include "../strcmp.S"	/* Only the plain strcmp build includes the parent directory's strcmp.S from this file.  */
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
new file mode 100644
index 0000000000..ed627a5f62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -0,0 +1,2250 @@
+/* strcpy with SSE2 and unaligned load
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)                  \
+	cfi_adjust_cfa_offset (4);     \
+	cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push of REG: CFA offset +4, REG recorded at offset 0.  */
+
+# define CFI_POP(REG)                   \
+	cfi_adjust_cfa_offset (-4);    \
+	cfi_restore (REG)	/* Unwind info for the matching pop: CFA offset -4, REG marked as restored.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push REG and emit the matching unwind info.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop REG and emit the matching unwind info.  */
+
+# ifndef STRCPY
+#  define STRCPY  __strcpy_sse2	/* Default symbol name when STRCPY is not already defined.  */
+# endif
+
+# define STR1  PARMS	/* %esp offset of the first argument; the entry code loads it into EDI.  */
+# define STR2  STR1+4	/* %esp offset of the second argument; the entry code loads it into ESI.  */
+# define LEN  STR2+4	/* %esp offset of the third argument; the entry code loads it into EBX.  */
+
+# ifdef USE_AS_STRNCPY
+#  define PARMS  16	/* Three 4-byte pushes from ENTRANCE plus, presumably, the 4-byte return address.  */
+#  define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)	/* Save the registers the function uses for LEN, STR2 and STR1.  */
+#  define RETURN  POP(%edi); POP(%esi); POP(%ebx); ret;          \
+	CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);	/* After the ret, re-declare the three saves for the code that follows.  */
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry of a jump table into ECX and branch to it.  TABLE is a
+	jump table with relative offsets.
+	INDEX is a register that contains the index into the jump table.
+	SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
+	/* We first load PC into ECX.  */                       \
+	SETUP_PIC_REG(cx);                                      \
+	/* Get the address of the jump table.  */               \
+	addl	$(TABLE - .), %ecx;                             \
+	/* Get the entry and convert the relative offset to the \
+	absolute address.  */                                   \
+	addl	(%ecx,INDEX,SCALE), %ecx;                       \
+	/* We loaded the jump table and adjusted ECX.  Go.  */  \
+	jmp	*%ecx
+# else
+#  define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute addresses.  INDEX is a register that contains the index
+	into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edi
+	mov	STR2(%esp), %esi
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+
+	mov	%esi, %ecx
+# ifndef USE_AS_STPCPY
+	mov	%edi, %eax      /* save result */
+# endif
+	and	$15, %ecx
+	jz	L(SourceStringAlignmentZero)
+
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%esi), %xmm1
+	add	%ecx, %ebx
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%edi)
+
+	sub	%ecx, %edi
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movdqu	%xmm3, (%edi, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %edi
+	lea	128(%ebx, %edx), %ebx
+
+L(Unaligned64Loop):
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+L(Unaligned64Loop_start):
+	add	$64, %edi
+	add	$64, %esi
+	movdqu	%xmm4, -64(%edi)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%edi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+	movdqu	%xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+# endif
+	movdqu	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx
+	lea	49(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+	pxor	%xmm0, %xmm0
+	movdqa	(%esi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	16(%esi), %xmm0
+	movdqu	%xmm1, (%edi)
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	jmp	L(Unalign16Both)
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %edi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	sub	%ecx, %ebx
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	movdqu	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+# endif
+	movdqu	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+# endif
+	movdqu	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %edi
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(Exit0):
+# ifdef USE_AS_STPCPY
+	mov	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	%dh, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	(%edi), %eax
+# endif
+	sub	$1, %ebx
+	lea	1(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	sub	$2, %ebx
+	lea	2(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%edi)
+	movb	%dh, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	sub	$3, %ebx
+	lea	3(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	sub	$4, %ebx
+	lea	4(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%esi), %ecx
+	movb	%dh, 4(%edi)
+	movl	%ecx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	sub	$5, %ebx
+	lea	5(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	sub	$6, %ebx
+	lea	6(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	sub	$7, %ebx
+	lea	7(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	sub	$8, %ebx
+	lea	8(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%esi), %xmm0
+	movb	%dh, 8(%edi)
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	sub	$9, %ebx
+	lea	9(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	sub	$10, %ebx
+	lea	10(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	sub	$11, %ebx
+	lea	11(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	sub	$12, %ebx
+	lea	12(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	sub	$13, %ebx
+	lea	13(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	sub	$14, %ebx
+	lea	14(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	sub	$15, %ebx
+	lea	15(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	sub	$16, %ebx
+	lea	16(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+	movb	%dh, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	sub	$17, %ebx
+	lea	17(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	sub	$18, %ebx
+	lea	18(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	sub	$19, %ebx
+	lea	19(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	sub	$20, %ebx
+	lea	20(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dh, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	sub	$21, %ebx
+	lea	21(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	sub	$22, %ebx
+	lea	22(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	sub	$23, %ebx
+	lea	23(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	sub	$24, %ebx
+	lea	24(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%dh, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	sub	$25, %ebx
+	lea	25(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	sub	$26, %ebx
+	lea	26(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	sub	$27, %ebx
+	lea	27(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	sub	$28, %ebx
+	lea	28(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	sub	$29, %ebx
+	lea	29(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	sub	$30, %ebx
+	lea	30(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	sub	$31, %ebx
+	lea	31(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	sub	$32, %ebx
+	lea	32(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(StrncpyExit1):
+	movb	(%esi), %dl
+	movb	%dl, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit3):
+	movw	(%esi), %cx
+	movb	2(%esi), %dl
+	movw	%cx, (%edi)
+	movb	%dl, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit5):
+	movl	(%esi), %ecx
+	movb	4(%esi), %dl
+	movl	%ecx, (%edi)
+	movb	%dl, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit9):
+	movlpd	(%esi), %xmm0
+	movb	8(%esi), %dl
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%esi), %xmm0
+	movb	16(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movb	20(%esi), %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movb	24(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movb	32(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+	movb	%cl, 32(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movl	%edx, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%edi)
+	movb	%dl, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%edi)
+	movw	%dx, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movlpd	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 5(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 6(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):	/* Store %xmm2 at %edi + %ecx, then fall into the common exit below.  */
+	movdqu	%xmm2, (%edi, %ecx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):	/* NOTE(review): %edx is presumably a pmovmskb NUL mask and %ebx the remaining length -- confirm at the jump sites.  */
+	bsf	%edx, %edx	/* %edx = index of its lowest set bit.  */
+	add	$15, %ebx
+	add	%ecx, %edi	/* %edi now addresses the 16-byte chunk at offset %ecx.  */
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax	/* Result: address of the byte at that bit index.  */
+# endif
+	sub	%edx, %ebx	/* Net effect: %ebx = %ebx + 15 - bit index.  */
+	lea	1(%edi, %edx), %edi	/* Step past that byte; control continues into the next label.  */
+
+	.p2align 4
+L(StrncpyFillTailWithZero):	/* Fill the %ebx bytes starting at %edi with zeros.  */
+	pxor	%xmm0, %xmm0	/* 16 zero bytes for the SSE stores.  */
+	xor	%edx, %edx	/* Zero for the scalar stores in L(Fill*).  */
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit)	/* 16 bytes or fewer: finish through the table.  */
+
+	movdqu	%xmm0, (%edi)	/* First 16 bytes, unaligned.  */
+	add	$16, %edi
+
+	mov	%edi, %esi
+	and	$0xf, %esi
+	sub	%esi, %edi	/* Round %edi down to a 16-byte boundary...  */
+	add	%esi, %ebx	/* ...and add the bytes stepped back over to the count.  */
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):	/* Aligned stores, 64 bytes per iteration.  */
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	movdqa	%xmm0, 32(%edi)
+	movdqa	%xmm0, 48(%edi)
+	add	$64, %edi
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):	/* Fewer than 64 bytes left; %ebx holds that count minus 64.  */
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	add	$32, %edi
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)	/* NOTE(review): macro defined outside this view; presumably jumps to FillTable entry %ebx.  */
+
+L(StrncpyFillLess32):	/* Fewer than 32 bytes left; %ebx holds that count minus 32.  */
+	add	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):	/* %ebx holds the count left minus 16; restore it and dispatch.  */
+	add	$16, %ebx
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):	/* NOTE(review): %edx is presumably the NUL mask for the 64 bytes held in %xmm4-%xmm7 -- confirm at the jump site.  */
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)	/* Nonzero %edx: take the Case2 path.  */
+L(Unaligned64LeaveCase3):	/* %edx == 0: store %xmm4-%xmm7 16 bytes at a time while the count in %ebx lasts.  */
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx	/* %ecx = (%ebx + 64) rounded down to a multiple of 16.  */
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%edi)
+# ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax	/* All 64 bytes stored: result is %edi + 64.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):	/* Test %xmm4-%xmm7 one at a time, storing each chunk that passes.  NOTE(review): assumes %xmm0 is all-zero on entry -- confirm.  */
+	xor	%ecx, %ecx	/* %ecx = byte offset of the chunk being tested.  */
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx	/* %edx = per-byte mask of %xmm4 == %xmm0.  */
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	pcmpeqb	%xmm5, %xmm0	/* %xmm0 is all-zero again here because the previous mask was 0.  */
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi	/* %ecx is 32 here, so both pointers move forward 48 bytes.  */
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)	/* Taken when the bit index is below %ebx (unsigned).  */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)	/* NOTE(review): macro defined outside this view; presumably dispatches on %ebx through ExitStrncpyTable.  */
+
+	.p2align 4
+L(ExitZero):
+	movl	%edi, %eax
+	RETURN
+
+END (STRCPY)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):	/* 32 entries; entry i (0-based) refers to L(Exit<i+1>), i.e. Exit1..Exit32.  */
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int    JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+
+L(ExitStrncpyTable):	/* 34 entries; entry 0 refers to L(Exit0), entry i (1..33) to L(StrncpyExit<i>).  */
+	.int	JMPTBL(L(Exit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+	.p2align 4
+L(FillTable):	/* 17 entries; entry i (0..16) refers to L(Fill<i>).  */
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+# else
+#  define PARMS  4	/* NOTE(review): presumably the %esp offset of the first argument at entry; STR1/STR2 are defined outside this view.  */
+#  define ENTRANCE	/* Expands to nothing: this variant has no entry code.  */
+#  define RETURN  POP (%edi); ret; CFI_PUSH (%edi)	/* Pop the saved %edi and return; used by the exits reached after PUSH (%edi).  */
+#  define RETURN1  ret	/* Plain return, used by the L(ExitTail*) paths taken before anything is pushed.  */
+
+	.text
+ENTRY (STRCPY)	/* Copies the string at STR2 to STR1: bytes are read through %ecx and written through %edx.  */
+	ENTRANCE
+	mov	STR1(%esp), %edx	/* %edx = destination pointer.  */
+	mov	STR2(%esp), %ecx	/* %ecx = source pointer.  */
+
+	cmpb	$0, (%ecx)	/* Check each of the first 16 source bytes for NUL; L(ExitTailN) copies N bytes and returns with a bare ret.  */
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	PUSH	(%ebx)
+
+	mov	%edx, %edi	/* Keep the original destination; non-stpcpy exits return it from %edi.  */
+	lea	16(%ecx), %ebx
+	and	$-16, %ebx	/* %ebx = first 16-byte boundary above %ecx.  */
+	pxor	%xmm0, %xmm0	/* %xmm0 = 0, the NUL pattern for pcmpeqb.  */
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)	/* Copy the first 16 bytes, already known to hold no NUL.  */
+	pcmpeqb	(%ebx), %xmm0
+	pmovmskb %xmm0, %eax	/* %eax = mask of NUL bytes in the aligned 16 bytes at %ebx.  */
+	sub	%ecx, %ebx	/* %ebx = offset of that aligned chunk from the source start.  */
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%ecx, %eax
+	lea	16(%ecx), %ecx
+	and	$-16, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+	xor	%ebx, %ebx
+
+	.p2align 4
+	movdqa	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movdqu	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm4
+	movdqu	%xmm3, (%edx, %ebx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm1
+	movdqu	%xmm4, (%edx, %ebx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm2
+	movdqu	%xmm1, (%edx, %ebx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%edx, %ebx)
+	mov	%ecx, %eax
+	lea	16(%ecx, %ebx), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+
+L(Aligned64Loop):	/* First pass: load 64 source bytes and test them for NUL before anything is stored.  */
+	movaps	(%ecx), %xmm2
+	movaps	%xmm2, %xmm4	/* %xmm4..%xmm7 keep the four 16-byte chunks; %xmm2/%xmm3 are scratch.  */
+	movaps	16(%ecx), %xmm5
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	add	$64, %ecx
+	pminub	%xmm7, %xmm3
+	add	$64, %edx
+	pminub	%xmm2, %xmm3	/* %xmm3 = bytewise minimum over all four chunks.  */
+	pcmpeqb	%xmm0, %xmm3	/* NOTE(review): relies on %xmm0 being all-zero here -- confirm.  */
+	pmovmskb %xmm3, %eax	/* Nonzero when some byte of the minimum equals the %xmm0 byte.  */
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+L(Aligned64Loop_start):	/* Steady state: store the previous 64 bytes while loading and testing the next 64.  */
+	movdqu	%xmm4, -64(%edx)	/* %edx was already advanced, hence the negative offsets.  */
+	movaps	(%ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edx)
+	movaps	16(%ecx), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%ecx), %xmm3
+	movdqu	%xmm6, -32(%edx)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edx)
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0	/* Result lands in %xmm0; it stays all-zero whenever no byte matched.  */
+	pmovmskb %xmm0, %eax
+	add	$64, %edx
+	add	$64, %ecx
+	test	%eax, %eax
+	jz	L(Aligned64Loop_start)
+L(Aligned64Leave):	/* Find which of %xmm4..%xmm7 matched, storing the chunks that come before it.  */
+	sub	$0xa0, %ebx	/* %ebx is 0x60 after the six unrolled steps, so this gives -64: the offset of the chunk in %xmm4 from %ecx/%edx.  */
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx	/* lea leaves the flags from test intact for the jnz below.  */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax	/* Last chunk: no branch, control continues into the code that follows.  */
+	lea	16(%ebx), %ebx
+
+/*-----------------End of main part---------------------------*/
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* Copy the final 1..16 bytes; %eax is the pmovmskb mask and %ebx the chunk offset.  */
+	add	%ebx, %edx	/* Point %edx and %ecx at the chunk that produced the mask.  */
+	add	%ebx, %ecx
+
+	POP	(%ebx)
+	test	%al, %al
+	jz	L(ExitHigh)	/* No bit set in the low 8 mask bits: examine %ah.  */
+	test	$0x01, %al	/* Lowest set bit k (0-based) selects L(Exit<k+1>).  */
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	/* Exit 8 */
+	movl	(%ecx), %eax	/* Only bit 7 can be set here: copy 8 bytes.  */
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax	/* Result: address of the 8th byte copied.  */
+# else
+	movl	%edi, %eax	/* Result: the destination saved in %edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitHigh):	/* Low mask byte was zero: bit k of %ah (0-based) selects L(Exit<k+9>).  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	/* Exit 16 */
+	movlpd	(%ecx), %xmm0	/* None of bits 0..6 of %ah set: copy all 16 bytes as two 8-byte moves.  */
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax	/* Result: address of the 16th byte copied.  */
+# else
+	movl	%edi, %eax	/* Result: the destination saved in %edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edx, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail8):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail9):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail10):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail11):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail12):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail16):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+END (STRCPY)
+# endif
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..effd85da94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3901 @@
+/* strcpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  define CFI_PUSH(REG)	/* Unwind annotations matching a pushl of REG.  */	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#  define CFI_POP(REG)	/* Unwind annotations matching a popl of REG.  */	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#  define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* pushl plus its CFI.  */
+#  define POP(REG)	popl REG; CFI_POP (REG)	/* popl plus its CFI.  */
+
+#  ifndef STRCPY	/* Default entry-point name unless STRCPY is predefined.  */
+#   define STRCPY  __strcpy_ssse3
+#  endif
+
+#  ifdef USE_AS_STRNCPY	/* %ebx (loaded from LEN below) is saved at entry.  */
+#   define PARMS  8	/* %esp offset of the first argument, see STR1.  */
+#   define ENTRANCE PUSH (%ebx)
+#   define RETURN  POP (%ebx); ret; CFI_PUSH (%ebx);
+#   define RETURN1  POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+#  else	/* !USE_AS_STRNCPY: nothing is pushed at entry.  */
+#   define PARMS  4	/* %esp offset of the first argument, see STR1.  */
+#   define ENTRANCE
+#   define RETURN  ret
+#   define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
+#  endif	/* RETURN1 also pops %edi.  NOTE(review): the CFI_PUSHes after ret presumably re-arm the unwind state for code that follows; confirm.  */
+
+#  ifdef USE_AS_STPCPY	/* SAVE_RESULT* load %eax: n(%edx) for stpcpy ...  */
+#   define SAVE_RESULT(n)  lea	n(%edx), %eax
+#   define SAVE_RESULT_TAIL(n)  lea	n(%edx), %eax
+#  else	/* ... otherwise %edi (SAVE_RESULT) or %edx (SAVE_RESULT_TAIL).  */
+#   define SAVE_RESULT(n)  movl	%edi, %eax
+#   define SAVE_RESULT_TAIL(n)  movl	%edx, %eax
+#  endif
+
+#  define STR1  PARMS	/* Used as STR1(%esp); loaded into %edx at entry.  */
+#  define STR2  STR1+4	/* Used as STR2(%esp); loaded into %ecx at entry.  */
+#  define LEN  STR2+4	/* Used as LEN(%esp); loaded into %ebx (USE_AS_STRNCPY only).  */
+
+/* In this code the following instructions are used for copying:
+	movb	- 1 byte
+	movw	- 2 bytes
+	movl	- 4 bytes
+	movlpd	- 8 bytes
+	movaps	- 16 bytes - requires 16-byte alignment
+	of source and destination addresses.
+*/
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+#  ifdef USE_AS_STRNCPY
+	movl	LEN(%esp), %ebx
+	cmp	$8, %ebx
+	jbe	L(StrncpyExit8Bytes)
+#  endif
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	jb	L(StrncpyExit15Bytes)
+#  endif
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	je	L(ExitTail16)
+#  endif
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+# endif
+	PUSH	(%esi)
+# ifdef USE_AS_STRNCPY
+	mov	%ecx, %esi
+	sub	$16, %ebx
+	and	$0xf, %esi
+
+/* Add ecx's offset within its 16-byte block (ecx & 0xf) to ebx.  */
+
+	add	%esi, %ebx
+# endif
+	lea	16(%ecx), %esi
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	movlpd	(%ecx), %xmm1
+	movlpd	%xmm1, (%edx)
+
+	pcmpeqb	(%esi), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+# ifdef USE_AS_STRNCPY
+	add	%eax, %esi
+	lea	-1(%esi), %esi
+	and	$1<<31, %esi
+	test	%esi, %esi
+	jnz	L(ContinueCopy)
+	lea	16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+/* case: ecx_offset == edx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$8, %eax
+	jae	L(ShlHigh8)
+	cmp	$1, %eax
+	je	L(Shl1)
+	cmp	$2, %eax
+	je	L(Shl2)
+	cmp	$3, %eax
+	je	L(Shl3)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$5, %eax
+	je	L(Shl5)
+	cmp	$6, %eax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):
+	je	L(Shl8)
+	cmp	$9, %eax
+	je	L(Shl9)
+	cmp	$10, %eax
+	je	L(Shl10)
+	cmp	$11, %eax
+	je	L(Shl11)
+	cmp	$12, %eax
+	je	L(Shl12)
+	cmp	$13, %eax
+	je	L(Shl13)
+	cmp	$14, %eax
+	je	L(Shl14)
+	jmp	L(Shl15)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	lea	112(%ebx, %eax), %ebx
+# endif
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqb	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeaveCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+	lea	48(%ebx), %ebx
+# endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl1):
+	movaps	-1(%ecx), %xmm1
+	movaps	15(%ecx), %xmm2
+L(Shl1Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	31(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-15(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+	movaps	15(%ecx), %xmm2
+	movaps	31(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$1, %xmm3, %xmm4
+	jnz	L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave1)
+# endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	mov	$15, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):
+	movaps	-2(%ecx), %xmm1
+	movaps	14(%ecx), %xmm2
+L(Shl2Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	30(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-14(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+	movaps	14(%ecx), %xmm2
+	movaps	30(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$2, %xmm3, %xmm4
+	jnz	L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave2)
+# endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):
+	movaps	-3(%ecx), %xmm1
+	movaps	13(%ecx), %xmm2
+L(Shl3Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	29(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-13(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+	movaps	13(%ecx), %xmm2
+	movaps	29(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$3, %xmm3, %xmm4
+	jnz	L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave3)
+# endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave4)
+# endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):
+	movaps	-5(%ecx), %xmm1
+	movaps	11(%ecx), %xmm2
+L(Shl5Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	27(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-11(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+	movaps	11(%ecx), %xmm2
+	movaps	27(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$5, %xmm3, %xmm4
+	jnz	L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave5)
+# endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):
+	movaps	-6(%ecx), %xmm1
+	movaps	10(%ecx), %xmm2
+L(Shl6Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	26(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-10(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+	movaps	10(%ecx), %xmm2
+	movaps	26(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$6, %xmm3, %xmm4
+	jnz	L(Shl6Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave6)
+# endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):
+	movaps	-7(%ecx), %xmm1
+	movaps	9(%ecx), %xmm2
+L(Shl7Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	25(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-9(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+	movaps	9(%ecx), %xmm2
+	movaps	25(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$7, %xmm3, %xmm4
+	jnz	L(Shl7Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave7)
+# endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave8)
+# endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):
+	movaps	-9(%ecx), %xmm1
+	movaps	7(%ecx), %xmm2
+L(Shl9Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	23(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-7(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+	movaps	7(%ecx), %xmm2
+	movaps	23(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$9, %xmm3, %xmm4
+	jnz	L(Shl9Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave9)
+# endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$7, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):
+	movaps	-10(%ecx), %xmm1
+	movaps	6(%ecx), %xmm2
+L(Shl10Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	22(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-6(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+	movaps	6(%ecx), %xmm2
+	movaps	22(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$10, %xmm3, %xmm4
+	jnz	L(Shl10Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave10)
+# endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$6, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* L(Shl11): 16-bytes-per-step copy for a source that is 11 bytes past a
+   16-byte boundary: the movaps loads below read from -11(%ecx) and
+   5(%ecx), and palignr $11 splices two neighbouring source blocks into
+   one aligned store at (%edx).
+   NOTE(review): %xmm0 is presumably all-zero on entry (here it is only the
+   pcmpeqb comparand/result) and %ebx presumably holds the remaining
+   strncpy count -- both are set up outside this section; confirm against
+   the entry code.  */
+L(Shl11):
+	movaps	-11(%ecx), %xmm1
+	movaps	5(%ecx), %xmm2
+L(Shl11Start):
+	/* Step 1 of 4: %eax gets one bit per byte of %xmm2 that equals the
+	   matching byte of %xmm0.  %xmm3 keeps an unshifted copy of the
+	   block for the next splice.  */
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	/* No match: %xmm2 becomes the top 5 bytes of %xmm1 followed by the
+	   low 11 bytes of %xmm2, i.e. the 16 source bytes at (%ecx).  */
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	/* Step 2 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	/* Step 3 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	/* Step 4 of 4: %xmm3 still holds the step-3 block, so it is the
+	   "previous" operand of the final palignr below.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	21(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	/* Round the address of the next source block down to a 64-byte
+	   boundary.  %eax is the distance rounded off; %edx is moved back
+	   (and, for strncpy, %ebx moved up) by the same amount, so the loop
+	   below starts on a 64-byte-aligned load and copies the bytes in
+	   between again.  */
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-5(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-11(%ecx), %xmm1
+
+	/* 64 bytes per iteration.  pminub folds the four loaded blocks into
+	   %xmm7, so a byte equal to the %xmm0 comparand in any of them shows
+	   up in the single pcmpeqb/pmovmskb test.  %xmm7 is then reused to
+	   keep the unshifted fourth block, which becomes %xmm1 of the next
+	   iteration.  */
+L(Shl11LoopStart):
+	movaps	5(%ecx), %xmm2
+	movaps	21(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$11, %xmm3, %xmm4
+	/* palignr does not modify EFLAGS, so jnz still acts on the test
+	   above.  On a hit, redo these 64 bytes in 16-byte steps with
+	   %xmm1/%xmm2 already loaded.  */
+	jnz	L(Shl11Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave11)
+# endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl11LoopStart)
+
+	/* A match was found in the block at 5(%ecx) (mask in %eax).  Copy the
+	   8 bytes at offsets -3..4, which covers the 5 bytes in front of that
+	   block, then hand over with %esi = 5, the offset that
+	   L(CopyFrom1To16Bytes) adds to both pointers.  */
+L(Shl11LoopExit):
+	movlpd	-3(%ecx), %xmm0
+	movlpd	%xmm0, -3(%edx)
+	mov	$5, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* L(Shl12): 16-bytes-per-step copy for a source that is 12 bytes past a
+   16-byte boundary: aligned loads come from -12(%ecx) and 4(%ecx), and
+   palignr $12 joins the top 4 bytes of the previous block with the low
+   12 bytes of the current one for each aligned store at (%edx).
+   NOTE(review): %xmm0 is presumably all-zero on entry and %ebx presumably
+   the remaining strncpy count; both come from code outside this section
+   -- confirm.  */
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	/* Step 1 of 4: byte-compare the new block with %xmm0, mask in %eax;
+	   %xmm3 keeps the unshifted block.  */
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	/* Step 2 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	/* Step 3 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	/* Step 4 of 4: %xmm3 still holds the step-3 block and serves as the
+	   "previous" operand of the last palignr.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	/* Round the next source block address down to a 64-byte boundary;
+	   %eax is the amount rounded off and is taken back out of %edx (and
+	   added to %ebx for strncpy), so the loop re-copies those bytes.  */
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-12(%ecx), %xmm1
+
+	/* 64 bytes per iteration: pminub folds four blocks into %xmm7 for a
+	   single compare; %xmm7 then carries the unshifted fourth block into
+	   the next iteration as %xmm1.  */
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	/* palignr leaves EFLAGS alone; jnz acts on the test above and
+	   restarts the 16-byte stepping on these same 64 bytes.  */
+	jnz	L(Shl12Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave12)
+# endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+	/* Match found in the block at 4(%ecx) (mask in %eax): copy the 4
+	   bytes in front of it through %esi, then set %esi = 4 as the offset
+	   L(CopyFrom1To16Bytes) adds to both pointers.  */
+L(Shl12LoopExit):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* L(Shl13): 16-bytes-per-step copy for a source that is 13 bytes past a
+   16-byte boundary: aligned loads come from -13(%ecx) and 3(%ecx), and
+   palignr $13 joins the top 3 bytes of the previous block with the low
+   13 bytes of the current one for each aligned store at (%edx).
+   NOTE(review): %xmm0 is presumably all-zero on entry and %ebx presumably
+   the remaining strncpy count; both come from code outside this section
+   -- confirm.  */
+L(Shl13):
+	movaps	-13(%ecx), %xmm1
+	movaps	3(%ecx), %xmm2
+L(Shl13Start):
+	/* Step 1 of 4: byte-compare the new block with %xmm0, mask in %eax;
+	   %xmm3 keeps the unshifted block.  */
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	/* Step 2 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	/* Step 3 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	/* Step 4 of 4: %xmm3 still holds the step-3 block and serves as the
+	   "previous" operand of the last palignr.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	19(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	/* Round the next source block address down to a 64-byte boundary;
+	   %eax is the amount rounded off and is taken back out of %edx (and
+	   added to %ebx for strncpy), so the loop re-copies those bytes.  */
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-3(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-13(%ecx), %xmm1
+
+	/* 64 bytes per iteration: pminub folds four blocks into %xmm7 for a
+	   single compare; %xmm7 then carries the unshifted fourth block into
+	   the next iteration as %xmm1.  */
+L(Shl13LoopStart):
+	movaps	3(%ecx), %xmm2
+	movaps	19(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$13, %xmm3, %xmm4
+	/* palignr leaves EFLAGS alone; jnz acts on the test above and
+	   restarts the 16-byte stepping on these same 64 bytes.  */
+	jnz	L(Shl13Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave13)
+# endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl13LoopStart)
+
+	/* Match found in the block at 3(%ecx) (mask in %eax): copy the 4
+	   bytes at offsets -1..2 through %esi, which covers the 3 bytes in
+	   front of that block, then set %esi = 3 as the offset
+	   L(CopyFrom1To16Bytes) adds to both pointers.  */
+L(Shl13LoopExit):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* L(Shl14): 16-bytes-per-step copy for a source that is 14 bytes past a
+   16-byte boundary: aligned loads come from -14(%ecx) and 2(%ecx), and
+   palignr $14 joins the top 2 bytes of the previous block with the low
+   14 bytes of the current one for each aligned store at (%edx).
+   NOTE(review): %xmm0 is presumably all-zero on entry and %ebx presumably
+   the remaining strncpy count; both come from code outside this section
+   -- confirm.  */
+L(Shl14):
+	movaps	-14(%ecx), %xmm1
+	movaps	2(%ecx), %xmm2
+L(Shl14Start):
+	/* Step 1 of 4: byte-compare the new block with %xmm0, mask in %eax;
+	   %xmm3 keeps the unshifted block.  */
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	/* Step 2 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	/* Step 3 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	/* Step 4 of 4: %xmm3 still holds the step-3 block and serves as the
+	   "previous" operand of the last palignr.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	18(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	/* Round the next source block address down to a 64-byte boundary;
+	   %eax is the amount rounded off and is taken back out of %edx (and
+	   added to %ebx for strncpy), so the loop re-copies those bytes.  */
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-2(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-14(%ecx), %xmm1
+
+	/* 64 bytes per iteration: pminub folds four blocks into %xmm7 for a
+	   single compare; %xmm7 then carries the unshifted fourth block into
+	   the next iteration as %xmm1.  */
+L(Shl14LoopStart):
+	movaps	2(%ecx), %xmm2
+	movaps	18(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$14, %xmm3, %xmm4
+	/* palignr leaves EFLAGS alone; jnz acts on the test above and
+	   restarts the 16-byte stepping on these same 64 bytes.  */
+	jnz	L(Shl14Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave14)
+# endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl14LoopStart)
+
+	/* Match found in the block at 2(%ecx) (mask in %eax): copy the 4
+	   bytes at offsets -2..1 through %esi, which covers the 2 bytes in
+	   front of that block, then set %esi = 2 as the offset
+	   L(CopyFrom1To16Bytes) adds to both pointers.  */
+L(Shl14LoopExit):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* L(Shl15): 16-bytes-per-step copy for a source that is 15 bytes past a
+   16-byte boundary: aligned loads come from -15(%ecx) and 1(%ecx), and
+   palignr $15 joins the top byte of the previous block with the low 15
+   bytes of the current one for each aligned store at (%edx).
+   NOTE(review): %xmm0 is presumably all-zero on entry and %ebx presumably
+   the remaining strncpy count; both come from code outside this section
+   -- confirm.  */
+L(Shl15):
+	movaps	-15(%ecx), %xmm1
+	movaps	1(%ecx), %xmm2
+L(Shl15Start):
+	/* Step 1 of 4: byte-compare the new block with %xmm0, mask in %eax;
+	   %xmm3 keeps the unshifted block.  */
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	/* Step 2 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	/* Step 3 of 4.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	/* Step 4 of 4: %xmm3 still holds the step-3 block and serves as the
+	   "previous" operand of the last palignr.  */
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	17(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	/* Round the next source block address down to a 64-byte boundary;
+	   %eax is the amount rounded off and is taken back out of %edx (and
+	   added to %ebx for strncpy), so the loop re-copies those bytes.  */
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-1(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-15(%ecx), %xmm1
+
+	/* 64 bytes per iteration: pminub folds four blocks into %xmm7 for a
+	   single compare; %xmm7 then carries the unshifted fourth block into
+	   the next iteration as %xmm1.  */
+L(Shl15LoopStart):
+	movaps	1(%ecx), %xmm2
+	movaps	17(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$15, %xmm3, %xmm4
+	/* palignr leaves EFLAGS alone; jnz acts on the test above and
+	   restarts the 16-byte stepping on these same 64 bytes.  */
+	jnz	L(Shl15Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave15)
+# endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl15LoopStart)
+
+	/* Match found in the block at 1(%ecx) (mask in %eax): copy the 4
+	   bytes at offsets -3..0 through %esi, which covers the single byte
+	   in front of that block, then set %esi = 1 as the offset
+	   L(CopyFrom1To16Bytes) adds to both pointers.  */
+L(Shl15LoopExit):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+	/* Only the USE_AS_STRCAT build jumps.  Otherwise there is no branch
+	   here and execution runs on into the code placed next, which in
+	   this listing is L(CopyFrom1To16Bytes).  */
+# ifdef USE_AS_STRCAT
+	jmp	L(CopyFrom1To16Bytes)
+# endif
+
+
+# ifndef USE_AS_STRCAT
+
+	.p2align 4
+/* L(CopyFrom1To16Bytes): finish a copy whose end lies within the next 16
+   source bytes.
+   In:  %eax = byte mask from pmovmskb (bit i set => byte i matched),
+	%esi = offset added to both %edx (destination) and %ecx (source),
+	%ebx = strncpy counter that the caller already reduced by 16.
+   The lowest set mask bit i selects L(Exit<i+1>), which copies i+1 bytes.
+   L(Exit4), L(Exit8), L(Exit12) and L(Exit16) are reached by falling
+   through from the dispatch code directly above each of them.
+   NOTE(review): POP, SAVE_RESULT and RETURN1 are macros defined outside
+   this section.  SAVE_RESULT presumably leaves the return value in %eax
+   (the USE_AS_STPCPY code dereferences %eax right after it) -- confirm.  */
+L(CopyFrom1To16Bytes):
+#  ifdef USE_AS_STRNCPY
+	add	$16, %ebx
+#  endif
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	/* Low mask byte zero => first match is in bytes 8..15.  */
+	test	%al, %al
+	jz	L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+	/* %ah is scratch here: low nibble of %al zero => match in bytes 4..7.  */
+	mov	%al, %ah
+	and	$15, %ah
+	jz	L(ExitHigh4)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(3)
+#  ifdef USE_AS_STRNCPY
+	/* lea does not touch EFLAGS, so jnz tests the result of the sub:
+	   a non-zero remainder is zero-filled starting at %ecx.  */
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	/* cmpb sets CF only when the byte at (%eax) is 0; sbb then computes
+	   %eax + 1 - CF, i.e. %eax advances by one unless that byte is 0.  */
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4):
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(7)
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8):
+	/* %al is zero here and is used as scratch: low nibble of %ah zero =>
+	   match in bytes 12..15.  */
+	mov	%ah, %al
+	and	$15, %al
+	jz	L(ExitHigh12)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(11)
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12):
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT	(15)
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+#   ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+/* L(CopyFrom1To16BytesCase2): strncpy tail where the mask in %eax has a
+   match AND the count runs out within the same 16 bytes.  After the
+   pointer/count adjustment, each position N is tried in order and
+   L(ExitN) is taken as soon as either mask bit N-1 is set or %ebx == N,
+   i.e. whichever limit comes first decides how many bytes are copied.
+   NOTE(review): %ebx is presumably the remaining byte count less 16 on
+   entry (16 is added back first) -- confirm against the callers.  */
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	add	%esi, %edx
+
+	POP	(%esi)
+
+	/* Low mask byte zero => the match is in bytes 8..15.  */
+	test	%al, %al
+	jz	L(ExitHighCase2)
+
+	/* The match is in bytes 0..7; with more than 8 bytes still allowed
+	   the count cannot be the limit, so dispatch on the mask alone.  */
+	cmp	$8, %ebx
+	ja	L(CopyFrom1To16BytesLess8)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	jmp	L(Exit8)
+
+	.p2align 4
+L(ExitHighCase2):
+	/* The match is in bytes 8..15; with 8 or fewer bytes allowed the
+	   count is the limit, so copy by count only.  */
+	cmp	$8, %ebx
+	jbe	L(CopyFrom1To16BytesLess8Case3)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	jmp	L(Exit16)
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+/* L(CopyFrom1To16BytesCase2OrCase3): strncpy tail taken when the count
+   runs out.  A non-zero mask in %eax goes to L(CopyFrom1To16BytesCase2);
+   otherwise Case3 below copies by count alone.
+   Case3 adds 16 back to %ebx and applies the %esi offset to both
+   pointers, then copies %ebx bytes: counts 1-3, 5-7, 9-11 and 13-15 go to
+   the matching L(ExitN); the remaining counts copy 4, 8, 12 or 16 bytes
+   inline, selected by the ja thresholds at 4, 8 and 12.  The inline
+   copies pass N to SAVE_RESULT (L(ExitN) passes N-1) and return without
+   any zero-fill or USE_AS_STPCPY adjustment.
+   NOTE(review): %ebx is presumably in 1..16 after the add; the inline
+   4-byte copy would also be taken for 0 -- confirm that callers exclude
+   it.  */
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHigh8Case3)
+
+	/* Also entered from L(ExitHighCase2) when the count limit (<= 8)
+	   comes before the match.  */
+L(CopyFrom1To16BytesLess8Case3):
+	cmp	$4, %ebx
+	ja	L(ExitHigh4Case3)
+
+	cmp	$1, %ebx
+	je	L(Exit1)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(4)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4Case3):
+	cmp	$5, %ebx
+	je	L(Exit5)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(8)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8Case3):
+	cmp	$12, %ebx
+	ja	L(ExitHigh12Case3)
+
+	cmp	$9, %ebx
+	je	L(Exit9)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(12)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12Case3):
+	cmp	$13, %ebx
+	je	L(Exit13)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	SAVE_RESULT	(16)
+	RETURN1
+
+#  endif
+
+	.p2align 4
+/* L(Exit1), L(Exit2), L(Exit3), L(Exit5), L(Exit6), L(Exit7), L(Exit9),
+   L(Exit10), L(Exit11), L(Exit13), L(Exit14), L(Exit15): copy exactly N
+   bytes from (%ecx) to (%edx) and return.  Sizes that are not a single
+   register width use two stores that overlap (e.g. N = 7 stores 4 bytes
+   at offset 0 and 4 bytes at offset 3).
+   Every variant ends the same way:
+     - SAVE_RESULT (N-1);
+     - USE_AS_STRNCPY: %ebx -= N and %ecx = %edx + N; lea leaves EFLAGS
+       untouched, so the jnz acts on the sub and sends a non-zero
+       remainder to L(StrncpyFillTailWithZero1);
+     - USE_AS_STPCPY (inside the strncpy build): cmpb/sbb advance %eax by
+       one unless the byte at (%eax) is 0;
+     - RETURN1.
+   NOTE(review): SAVE_RESULT and RETURN1 are macros defined outside this
+   section; SAVE_RESULT presumably sets %eax to the return value --
+   confirm.  */
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT	(0)
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT	(1)
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT	(2)
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT	(4)
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT	(5)
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT	(6)
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT	(8)
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT	(9)
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT	(10)
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT	(12)
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT	(13)
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT	(14)
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+CFI_POP	(%edi)
+
+#  ifdef USE_AS_STRNCPY
+	.p2align 4
+/* L(Fill0) .. L(Fill16): store N bytes at (%ecx) and return, using at
+   most two stores; sizes that are not a single store width use two
+   overlapping stores (e.g. N = 7 writes 4 bytes at offset 0 and 4 bytes
+   at offset 3).
+   The data comes from %dl/%dx/%edx and the low half of %xmm0, so the
+   bytes written are zero only because L(StrncpyFillTailWithZero) clears
+   both registers before the dispatcher L(FillFrom1To16Bytes) is
+   reached.  */
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movw	%dx, (%ecx)
+	movb	%dl, 2(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%ecx)
+	movb	%dl, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%ecx)
+	movw	%dx, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movl	%edx, (%ecx)
+	movl	%edx, 3(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%ecx)
+	movb	%dl, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%ecx)
+	movw	%dx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 7(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 5(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 6(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 7(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+	RETURN
+
+	.p2align 4
+/* L(StrncpyFillExit1) / L(FillFrom1To16Bytes): dispatch on the number of
+   bytes left to fill, held in %ebx, to the matching L(FillN) for N in
+   0..16.  L(StrncpyFillExit1) is the entry used after a "sub $16, %ebx"
+   that went to zero or below; it adds the 16 back first.  The dispatch is
+   a small compare tree using signed branches (jl/jg).
+   NOTE(review): values of %ebx above 16 are not handled here (they would
+   end up in L(Fill15)); presumably the callers guarantee 0..16 --
+   confirm.  */
+L(StrncpyFillExit1):
+	lea	16(%ebx), %ebx
+L(FillFrom1To16Bytes):
+	test	%ebx, %ebx
+	jz	L(Fill0)
+	cmp	$16, %ebx
+	je	L(Fill16)
+	cmp	$8, %ebx
+	je	L(Fill8)
+	jg	L(FillMore8)
+	cmp	$4, %ebx
+	je	L(Fill4)
+	jg	L(FillMore4)
+	/* 1..3 left; the final jg is always taken when reached.  */
+	cmp	$2, %ebx
+	jl	L(Fill1)
+	je	L(Fill2)
+	jg	L(Fill3)
+L(FillMore8):	/* but less than 16 */
+	cmp	$12, %ebx
+	je	L(Fill12)
+	jl	L(FillLess12)
+	cmp	$14, %ebx
+	jl	L(Fill13)
+	je	L(Fill14)
+	jg	L(Fill15)
+L(FillMore4):	/* but less than 8 */
+	cmp	$6, %ebx
+	jl	L(Fill5)
+	je	L(Fill6)
+	jg	L(Fill7)
+L(FillLess12):	/* but more than 8 */
+	cmp	$10, %ebx
+	jl	L(Fill9)
+	je	L(Fill10)
+	jmp	L(Fill11)
+
+	CFI_PUSH(%edi)
+
+	.p2align 4
+/* L(StrncpyFillTailWithZero1) / L(StrncpyFillTailWithZero): write %ebx
+   zero bytes starting at %ecx, then return through one of the L(FillN)
+   stubs.  The "1" entry first executes POP (%edi) and then continues
+   into the common code.
+   Plan: clear %xmm0 and %edx; if 16 bytes or fewer remain, go straight to
+   the small-size dispatcher.  Otherwise store 16 bytes unaligned, round
+   %ecx down to a 16-byte boundary (giving the rounded-off bytes back to
+   %ebx), run a 64-bytes-per-iteration movdqa loop, then handle up to 32
+   and 16 more bytes before dispatching the last 0..16 bytes.
+   NOTE(review): POP is a macro defined outside this section -- confirm
+   what besides the pop it emits.  */
+L(StrncpyFillTailWithZero1):
+	POP	(%edi)
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit1)
+
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+
+	lea	16(%ecx), %ecx
+
+	/* %edx = misalignment of %ecx.  Step %ecx back to the 16-byte
+	   boundary and add those bytes to the count again; they get
+	   rewritten with zeros.  %edx is cleared once more because the
+	   L(FillN) stubs store from it.  */
+	mov	%ecx, %edx
+	and	$0xf, %edx
+	sub	%edx, %ecx
+	add	%edx, %ebx
+	xor	%edx, %edx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	movdqa	%xmm0, 32(%ecx)
+	movdqa	%xmm0, 48(%ecx)
+	lea	64(%ecx), %ecx
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+	/* %ebx is (bytes left - 64) here, so it is negative.  The adds and
+	   subs below re-bias it step by step: -32, then -48 on this path,
+	   or -16 on the L(StrncpyFillLess32) path.  */
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	lea	32(%ecx), %ecx
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+#  endif
+
+	.p2align 4
+/* L(ExitTail1) .. L(ExitTail13): copy exactly N bytes from (%ecx) to
+   (%edx) and return.  They mirror the L(ExitN) stubs but use
+   SAVE_RESULT_TAIL and RETURN, and on a non-zero strncpy remainder they
+   jump to L(StrncpyFillTailWithZero), the entry without the extra POP.
+   Common ending of each variant:
+     - SAVE_RESULT_TAIL (N-1);
+     - USE_AS_STRNCPY: %ebx -= N and %ecx = %edx + N; lea leaves EFLAGS
+       untouched, so the jnz acts on the sub;
+     - USE_AS_STPCPY (inside the strncpy build): cmpb/sbb advance %eax by
+       one unless the byte at (%eax) is 0;
+     - RETURN.
+   NOTE(review): SAVE_RESULT_TAIL and RETURN are macros defined outside
+   this section; SAVE_RESULT_TAIL presumably sets %eax to the return
+   value -- confirm.
+   NOTE(review): L(ExitTail8) has no USE_AS_STPCPY adjustment, unlike its
+   neighbours.  Presumably it is only reached when the 8th byte is the
+   terminator -- confirm against the callers, which are outside this
+   section.  */
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT_TAIL (0)
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT_TAIL (1)
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT_TAIL (2)
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT_TAIL (4)
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT_TAIL (5)
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT_TAIL (6)
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (7)
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT_TAIL (8)
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT_TAIL (9)
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT_TAIL (10)
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT_TAIL (12)
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+/* L(ExitTail14): copy exactly 14 bytes from (%ecx) to (%edx) with two
+   overlapping 8-byte moves (offsets 0 and 6), then return.
+   USE_AS_STRNCPY: %ebx -= 14 and %ecx = %edx + 14; lea leaves EFLAGS
+   untouched, so the jnz acts on the sub and sends a non-zero remainder to
+   L(StrncpyFillTailWithZero).
+   USE_AS_STPCPY (inside the strncpy build): cmpb sets CF only when the
+   byte at (%eax) is 0 and sbb computes %eax + 1 - CF, so %eax advances by
+   one unless that byte is 0.
+   The nested USE_AS_STPCPY directive is indented one level deeper than
+   the enclosing USE_AS_STRNCPY one, matching its own "#   endif" and the
+   other L(ExitTailN) stubs.
+   NOTE(review): SAVE_RESULT_TAIL and RETURN are macros defined outside
+   this section; SAVE_RESULT_TAIL presumably sets %eax -- confirm.  */
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT_TAIL (13)
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+/* L(ExitTail15), L(ExitTail16): copy exactly 15 (two overlapping 8-byte
+   moves at offsets 0 and 7) or 16 (one unaligned movdqu pair) bytes from
+   (%ecx) to (%edx) and return.  In the strncpy build %ebx is reduced by
+   N and %ecx set to %edx + N; lea leaves EFLAGS untouched, so the jnz
+   acts on the sub and sends a non-zero remainder to
+   L(StrncpyFillTailWithZero).
+   NOTE(review): SAVE_RESULT_TAIL and RETURN are macros defined outside
+   this section; SAVE_RESULT_TAIL presumably sets %eax -- confirm.
+   NOTE(review): L(ExitTail15) has no USE_AS_STPCPY adjustment while
+   L(ExitTail16) does.  Presumably L(ExitTail15) is only reached when the
+   15th byte is the terminator -- confirm against the callers, which are
+   outside this section.  */
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT_TAIL (14)
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (15)
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	/* cmpb sets CF only when the byte at (%eax) is 0; sbb then computes
+	   %eax + 1 - CF.  */
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+#  endif
+	.p2align 4
+/* L(StrncpyLeaveCase2OrCase3): strncpy exit from a 64-byte loop whose
+   four blocks are still pending in %xmm4..%xmm7.  %eax != 0 selects
+   L(Aligned64LeaveCase2) (a match was seen), otherwise
+   L(Aligned64LeaveCase3) (count limit only).  Both walk the blocks in
+   order, storing each finished block at -64/-48/-32(%edx) and advancing
+   %esi by 16, until the count in %ebx is used up or, in Case2, the block
+   containing the match is found; the last partial block is left to the
+   L(CopyFrom1To16Bytes*) routines.
+   NOTE(review): the loop that branches here is outside this section.
+   Presumably it has already advanced %edx by 64 (hence the negative store
+   offsets) and subtracted 64 from %ebx (hence the add $48) -- confirm.
+   NOTE(review): how the stores at fixed offsets from %edx relate to the
+   running %esi offset depends on the value %esi has on entry -- confirm
+   against the caller.  */
+L(StrncpyLeaveCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+	/* Re-test the blocks one at a time.  %xmm0 is the comparand; it is
+	   all-zero again after every compare that finds no match, which is
+	   the only case in which the next compare is reached.  */
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	/* Last block: goes to Case2 without testing the mask again.  */
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase2)
+
+/*--------------------------------------------------*/
+	.p2align 4
+/* L(StrncpyExit<N>Case2OrCase3), N = 1..15: strncpy exits for the
+   L(Shl<N>) stepping code, taken when "sub $16, %ebx" reaches zero or
+   below (the branches for N = 11..15 are visible earlier in this
+   listing).
+   Each stub copies at least the first 16-N bytes at (%ecx) to (%edx) with
+   one or two moves (some variants also rewrite a few neighbouring bytes),
+   sets %esi = 16-N as the offset the tail routines add to both pointers,
+   and then picks L(CopyFrom1To16BytesCase2) when the mask in %eax is
+   non-zero or L(CopyFrom1To16BytesCase3) when it is zero.
+   The mov to %esi does not change EFLAGS, but the test comes after it
+   anyway.  Variants that stage data through %esi overwrite it with the
+   offset immediately afterwards.  */
+L(StrncpyExit1Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	mov	$15, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit2Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit3Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit4Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit5Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit6Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit7Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit8Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit9Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$7, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit10Case2OrCase3):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$6, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit11Case2OrCase3):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$5, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit12Case2OrCase3):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit13Case2OrCase3):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit14Case2OrCase3):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit15Case2OrCase3):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+/* L(StrncpyLeave<N>) / L(StrncpyExit<N>), N = 1..4: strncpy exit taken
+   when the remaining count runs out during a 64-byte iteration, with no
+   match in those 64 bytes.
+   L(StrncpyLeave<N>) stores up to four more aligned 16-byte blocks while
+   the count allows: block 1 is built here with palignr from
+   %xmm1/%xmm2, block 2 from a reloaded source block and the saved copy
+   in %xmm3, and blocks 3 and 4 are stored straight from %xmm4 and
+   %xmm5.  %esi counts the bytes stored.
+   L(StrncpyExit<N>) then moves both pointers forward by %esi + 16 - N,
+   copies the 16 (N = 1..3) or 12 (N = 4) bytes that end at the new
+   position, clears %esi and lets L(CopyFrom1To16BytesCase3) copy the
+   remaining bytes by count.
+   NOTE(review): the loops that branch here are outside this section.
+   Presumably %esi is 0 on entry, %ebx has just had 64 subtracted (hence
+   the add $48), and %xmm4/%xmm5 already hold the shifted third and
+   fourth blocks, as in the L(Shl<N>LoopStart) loops visible for
+   N = 11..15 -- confirm.  */
+L(StrncpyLeave1):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit1)
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit1):
+	lea	15(%edx, %esi), %edx
+	lea	15(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit2)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit2):
+	lea	14(%edx, %esi), %edx
+	lea	14(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit3)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit3):
+	lea	13(%edx, %esi), %edx
+	lea	13(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit4)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit4):
+	/* Unlike N = 1..3 this copies 12 bytes (8 + 4) instead of one 16-byte
+	   movdqu, and stages the 4-byte part through %eax.  */
+	lea	12(%edx, %esi), %edx
+	lea	12(%ecx, %esi), %ecx
+	movlpd	-12(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit5)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit5):
+	lea	11(%edx, %esi), %edx
+	lea	11(%ecx, %esi), %ecx
+	movlpd	-11(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -11(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit6)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit6):
+	lea	10(%edx, %esi), %edx
+	lea	10(%ecx, %esi), %ecx
+
+	movlpd	-10(%ecx), %xmm0
+	movw	-2(%ecx), %ax
+	movlpd	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit7)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit7):
+	lea	9(%edx, %esi), %edx
+	lea	9(%ecx, %esi), %ecx
+
+	movlpd	-9(%ecx), %xmm0
+	movb	-1(%ecx), %ah
+	movlpd	%xmm0, -9(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit8)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit8):
+	lea	8(%edx, %esi), %edx
+	lea	8(%ecx, %esi), %ecx
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit9)
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit9):
+	lea	7(%edx, %esi), %edx
+	lea	7(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit10)
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit10):
+	lea	6(%edx, %esi), %edx
+	lea	6(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit11)
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit11):
+	lea	5(%edx, %esi), %edx
+	lea	5(%ecx, %esi), %ecx
+	movl	-5(%ecx), %esi
+	movb	-1(%ecx), %ah
+	movl	%esi, -5(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit12)
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit12):
+	lea	4(%edx, %esi), %edx
+	lea	4(%ecx, %esi), %ecx
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit13)
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit13):
+	lea	3(%edx, %esi), %edx
+	lea	3(%ecx, %esi), %ecx
+
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit14)
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit14):
+	lea	2(%edx, %esi), %edx
+	lea	2(%ecx, %esi), %ecx
+	movw	-2(%ecx), %ax
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit15)
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit15):
+	lea	1(%edx, %esi), %edx
+	lea	1(%ecx, %esi), %ecx
+	movb	-1(%ecx), %ah
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+# endif
+
+# ifndef USE_AS_STRCAT
+#  ifdef USE_AS_STRNCPY
+	CFI_POP (%esi)
+	CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail0):
+	movl	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15Bytes):
+	cmp	$12, %ebx
+	jbe	L(StrncpyExit12Bytes)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmp	$13, %ebx
+	je	L(ExitTail13)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmp	$14, %ebx
+	je	L(ExitTail14)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+#   ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   else
+	movl	%edx, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12Bytes):
+	cmp	$9, %ebx
+	je	L(ExitTail9)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmp	$10, %ebx
+	je	L(ExitTail10)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmp	$11, %ebx
+	je	L(ExitTail11)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8Bytes):
+	cmp	$4, %ebx
+	jbe	L(StrncpyExit4Bytes)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+
+	cmp	$5, %ebx
+	je	L(ExitTail5)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmp	$6, %ebx
+	je	L(ExitTail6)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmp	$7, %ebx
+	je	L(ExitTail7)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+#   ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   else
+	movl	%edx, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4Bytes):
+	test	%ebx, %ebx
+	jz	L(ExitTail0)
+	cmp	$1, %ebx
+	je	L(ExitTail1)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmp	$2, %ebx
+	je	L(ExitTail2)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmp	$3, %ebx
+	je	L(ExitTail3)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+	RETURN
+#  endif
+
+END (STRCPY)
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
new file mode 100644
index 0000000000..ffbc03c6d5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
@@ -0,0 +1,116 @@
+/* Multiple versions of strcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)	/* Plain strcpy build.  */
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+/* Per-variant names of the three implementations and of the __GI_ aliases.  */
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY	/* stpncpy */
+#  define STRCPY_SSSE3	__stpncpy_ssse3
+#  define STRCPY_SSE2		__stpncpy_sse2
+#  define STRCPY_IA32		__stpncpy_ia32
+#  define __GI_STRCPY		__GI_stpncpy
+#  define __GI___STRCPY		__GI___stpncpy
+# else	/* stpcpy */
+#  define STRCPY_SSSE3	__stpcpy_ssse3
+#  define STRCPY_SSE2		__stpcpy_sse2
+#  define STRCPY_IA32		__stpcpy_ia32
+#  define __GI_STRCPY		__GI_stpcpy
+#  define __GI___STRCPY		__GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY	/* strncpy; no __GI___STRCPY name is defined for it.  */
+#  define STRCPY_SSSE3	__strncpy_ssse3
+#  define STRCPY_SSE2		__strncpy_sse2
+#  define STRCPY_IA32		__strncpy_ia32
+#  define __GI_STRCPY		__GI_strncpy
+# else	/* strcpy; no __GI___STRCPY name is defined for it either.  */
+#  define STRCPY_SSSE3	__strcpy_ssse3
+#  define STRCPY_SSE2		__strcpy_sse2
+#  define STRCPY_IA32		__strcpy_ia32
+#  define __GI_STRCPY		__GI_strcpy
+# endif
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncpy in static library since we
+   need strncpy before the initialization happened.  */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCPY)
+	.type	STRCPY, @gnu_indirect_function	/* STRCPY is an IFUNC resolver, not the copy routine itself.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCPY_IA32)	/* Start with the IA32 version as the candidate.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set after the SSE2 test: keep IA32 (presumably "no SSE2"; macro is defined elsewhere).  */
+	LOAD_FUNC_GOT_EAX (STRCPY_SSE2)	/* Candidate is now the SSE2 version.  */
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* ZF clear after the Fast_Unaligned_Load test: stay with SSE2, skip the SSSE3 check.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* ZF set after the SSSE3 test: stay with SSE2.  */
+	LOAD_FUNC_GOT_EAX (STRCPY_SSSE3)	/* Otherwise the candidate becomes the SSSE3 version.  */
+2:	ret
+END(STRCPY)
+
+# undef ENTRY	/* Redefined: whatever name is passed, the symbol emitted is STRCPY_IA32.  */
+# define ENTRY(name) \
+	.type STRCPY_IA32, @function; \
+	.align 16; \
+	.globl STRCPY_IA32; \
+	.hidden STRCPY_IA32; \
+	STRCPY_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Redefined likewise: the size is always recorded for STRCPY_IA32.  */
+# define END(name) \
+	cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32
+/* Both aliases are therefore plain assignments to STRCPY_IA32.  */
+# endif	/* SHARED */
+#endif	/* IS_IN (libc) */
+
+#ifdef USE_AS_STPCPY	/* Pull in the baseline source for the variant being built.  */
+# ifdef USE_AS_STRNCPY
+#  include "../../stpncpy.S"
+# else
+#  include "../../i586/stpcpy.S"
+# endif
+#else
+# ifndef USE_AS_STRNCPY	/* A strncpy build includes nothing from here.  */
+#  include "../../i586/strcpy.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..6d61e190a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
@@ -0,0 +1,2 @@
+#define __strcspn_sse2 __strcspn_ia32 /* Every __strcspn_sse2 in the file included below becomes __strcspn_ia32.  */
+#include <sysdeps/x86_64/multiarch/strcspn-c.c> /* Reuse the x86_64 source under that renamed symbol.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
new file mode 100644
index 0000000000..21e5093924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
@@ -0,0 +1,75 @@
+/* Multiple versions of strcspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRPBRK	/* strpbrk build; STRCSPN itself is not defined in this branch (presumably by the includer; confirm).  */
+#define STRCSPN_SSE42	__strpbrk_sse42
+#define STRCSPN_IA32	__strpbrk_ia32
+#define __GI_STRCSPN	__GI_strpbrk
+#else
+#ifndef STRCSPN	/* Defaults for strcspn; all four names are skipped if STRCSPN is already defined.  */
+#define STRCSPN		strcspn
+#define STRCSPN_SSE42	__strcspn_sse42
+#define STRCSPN_IA32	__strcspn_ia32
+#define __GI_STRCSPN	__GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in static library since we
+   need strpbrk before the initialization happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
+	.text
+ENTRY(STRCSPN)
+	.type	STRCSPN, @gnu_indirect_function	/* STRCSPN is an IFUNC resolver, not the search routine itself.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCSPN_IA32)	/* Start with the IA32 version as the candidate.  */
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f	/* ZF set after the SSE4_2 test: keep IA32 (presumably "no SSE4.2"; macro is defined elsewhere).  */
+	LOAD_FUNC_GOT_EAX (STRCSPN_SSE42)	/* Otherwise the candidate becomes the SSE4.2 version.  */
+2:	ret
+END(STRCSPN)
+
+# undef ENTRY	/* Redefined: whatever name is passed, the symbol emitted is STRCSPN_IA32.  */
+# define ENTRY(name) \
+	.type STRCSPN_IA32, @function; \
+	.globl STRCSPN_IA32; \
+	.p2align 4; \
+	STRCSPN_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Redefined likewise: the size is always recorded for STRCSPN_IA32.  */
+# define END(name) \
+	cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
+#endif	/* (SHARED || !USE_AS_STRPBRK) && IS_IN (libc) */
+
+#ifdef USE_AS_STRPBRK	/* Pull in the baseline source for the variant being built.  */
+#include "../../strpbrk.S"
+#else
+#include "../../strcspn.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
new file mode 100644
index 0000000000..d3ea864bab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -0,0 +1,125 @@
+/* strlen with SSE2 and BSF
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined SHARED && IS_IN (libc)
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+/* CFI_PUSH above records a 4-byte push of REG; CFI_POP below records the matching pop.  */
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push REG and emit the unwind annotation for it.  */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* Pop REG and emit the unwind annotation for it.  */
+#define PARMS		4 + 8	/* Preserve ESI and EDI.  */
+#define	STR		PARMS	/* Offset of the string argument from %esp once ENTRANCE has run.  */
+#define ENTRANCE	PUSH (%esi); PUSH (%edi); cfi_remember_state
+#define RETURN		POP (%edi); POP (%esi); ret; \
+			cfi_restore_state; cfi_remember_state
+
+	.text
+ENTRY ( __strlen_sse2_bsf)
+	ENTRANCE	/* Save %esi and %edi.  */
+	mov	STR(%esp), %edi	/* %edi = string pointer argument.  */
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx	/* %ecx = offset of the string within its 64-byte block.  */
+	pxor	%xmm0, %xmm0	/* %xmm0 = 16 zero bytes to compare against.  */
+	cmp	$0x30, %ecx
+	ja	L(next)	/* Offset above 48: an unaligned 16-byte load would run past the 64-byte block.  */
+	movdqu	(%edi), %xmm1	/* Unaligned load of the first 16 bytes of the string.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx	/* Bit i of %edx is set when byte i is NUL.  */
+	test	%edx, %edx
+	jnz	L(exit_less16)	/* NUL within the first 16 bytes; %eax is still 0.  */
+	mov	%edi, %eax
+	and	$-16, %eax	/* %eax = string pointer rounded down to a multiple of 16.  */
+	jmp	L(align16_start)
+L(next):
+
+	mov	%edi, %eax
+	and	$-16, %eax	/* %eax = string pointer rounded down to a multiple of 16.  */
+	pcmpeqb	(%eax), %xmm0	/* Aligned compare; the block may begin before the string does.  */
+	mov	$-1, %esi
+	sub	%eax, %ecx	/* The low five bits of %ecx (all that shl uses) now equal %edi & 15.  */
+	shl	%cl, %esi	/* %esi = mask with the bits for bytes before the string cleared.  */
+	pmovmskb %xmm0, %edx
+	and	%esi, %edx	/* Drop NULs that lie before the start of the string.  */
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):
+	pcmpeqb	16(%eax), %xmm0	/* A compare that finds no NUL leaves the register all-zero for the next pass.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax	/* Advance by 64; lea leaves the flags alone, the test below sets them.  */
+	test	%edx, %edx
+	jz	L(align16_loop)
+L(exit):
+	sub	%edi, %eax	/* %eax = offset of the matching block from the string start.  */
+L(exit_less16):
+	bsf	%edx, %edx	/* Index of the first NUL within the block.  */
+	add	%edx, %eax	/* Length = block offset + index.  */
+	RETURN
+L(exit16):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$16, %eax	/* The matching block was the one at 16(%eax).  */
+	RETURN
+L(exit32):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$32, %eax	/* The matching block was the one at 32(%eax).  */
+	RETURN
+L(exit48):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$48, %eax	/* The matching block was the one at 48(%eax).  */
+	POP (%edi)	/* Same pops as RETURN, written out without its trailing CFI state directives.  */
+	POP (%esi)
+	ret
+
+END ( __strlen_sse2_bsf)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
new file mode 100644
index 0000000000..36fc1469d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -0,0 +1,695 @@
+/* strlen with SSE2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* For strlen, only the SHARED version is optimized; for strcat, strncat and strnlen, both the STATIC and SHARED versions are optimized.  */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+
+#  include <sysdep.h>
+#  define PARMS	4	/* Offset from %esp at which the entry code reads its first argument.  */
+#  define STR	PARMS
+#  define RETURN	ret
+
+#  ifdef USE_AS_STRNLEN
+#   define LEN	PARMS + 8	/* Used only after PUSH (%edi), so this addresses 12(%esp).  */
+#   define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#   define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#   define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
+#   define POP(REG)	popl	REG;	CFI_POP (REG)
+#   undef RETURN
+#   define RETURN	POP (%edi); CFI_PUSH(%edi); ret	/* Pop %edi, then re-emit its push CFI so code after the ret is still described with %edi saved.  */
+#  endif
+
+#  ifndef STRLEN
+#   define STRLEN	__strlen_sse2
+#  endif
+
+	atom_text_section
+ENTRY (STRLEN)
+	mov	STR(%esp), %edx	/* %edx = string pointer argument.  */
+#  ifdef USE_AS_STRNLEN
+	PUSH	(%edi)
+	movl	LEN(%esp), %edi	/* %edi = maximum length argument.  */
+	sub	$4, %edi
+	jbe	L(len_less4_prolog)	/* The limit is 4 or less.  */
+#  endif
+# endif	/* !USE_AS_STRCAT; a strcat build starts here, presumably with %edx set by the including file.  */
+	xor	%eax, %eax	/* The exit_tailN labels add N to this zero.  */
+	cmpb	$0, (%edx)	/* Check the first 16 bytes one at a time.  */
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less8_prolog)	/* The limit is 8 or less.  */
+# endif
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less12_prolog)	/* The limit is 12 or less.  */
+# endif
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less16_prolog)	/* The limit is 16 or less.  */
+# endif
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+
+	pxor	%xmm0, %xmm0
+	lea	16(%edx), %eax
+	mov	%eax, %ecx	/* %ecx = string + 16, the reference that L(exit) subtracts.  */
+	and	$-16, %eax	/* %eax = string + 16 rounded down to a multiple of 16.  */
+
+# ifdef USE_AS_STRNLEN
+	and	$15, %edx
+	add	%edx, %edi	/* %edi = bytes of the limit left, counted from %eax.  */
+	sub	$64, %edi
+	jbe	L(len_less64)	/* 64 or fewer left: continue on the bounded path.  */
+# endif
+
+	pcmpeqb	(%eax), %xmm0	/* First of sixteen unrolled 16-byte block checks.  */
+	pmovmskb %xmm0, %edx	/* Bit i of %edx is set when byte i of the block is NUL.  */
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax	/* Step past the block; lea leaves the flags from test intact.  */
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0	/* Second group of four; a compare without a match left each register all-zero.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0	/* Third group of four.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0	/* Fourth group of four.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	mov	%eax, %edx
+	and	$63, %edx
+	add	%edx, %edi	/* Credit back the bytes that the 64-byte round-down below covers again.  */
+# endif
+
+	and	$-0x40, %eax	/* Round %eax down to a multiple of 64 for the loop that follows.  */
+
+	.p2align 4
+L(aligned_64_loop):
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)	/* 64 or fewer bytes of the limit left: leave for the bounded path.  */
+# endif
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2	/* %xmm2 = byte-wise minimum of all four blocks.  */
+	pcmpeqb	%xmm3, %xmm2	/* %xmm3 is all-zero here: its last compare before the loop found no NUL.  */
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax	/* Advance by 64; lea leaves the flags from test intact.  */
+	jz	L(aligned_64_loop)
+
+	pcmpeqb	-64(%eax), %xmm3	/* Re-test block 0 from memory (%xmm0 was overwritten by pminub).  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx	/* Bias %ecx so the subtraction at L(exit) yields block 0's offset.  */
+	jnz	L(exit)
+
+	pcmpeqb	%xmm1, %xmm3	/* Block 1 is still intact in %xmm1; %xmm3 is zero again after the failed compare.  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx	/* One block further on, so reduce the bias by 16.  */
+	jnz	L(exit)
+
+	pcmpeqb	-32(%eax), %xmm3	/* Block 2 from memory (%xmm2 was overwritten).  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm6, %xmm3	/* Block 3; no test is made, execution falls into L(exit) regardless.  */
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+L(exit):
+	sub	%ecx, %eax	/* %eax = offset of the matching 16-byte block from the string start.  */
+	test	%dl, %dl
+	jz	L(exit_high)	/* No NUL flagged in bytes 0-7 of the block.  */
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_8)	/* No NUL flagged in bytes 0-3, so look at bytes 4-7.  */
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(exit_tail1)
+	test	$0x04, %dl
+	jnz	L(exit_tail2)
+	add	$3, %eax	/* Only bit 3 is left.  */
+	RETURN
+
+	.p2align 4
+L(exit_8):
+	test	$0x10, %dl
+	jnz	L(exit_tail4)
+	test	$0x20, %dl
+	jnz	L(exit_tail5)
+	test	$0x40, %dl
+	jnz	L(exit_tail6)
+	add	$7, %eax	/* Only bit 7 is left.  */
+	RETURN
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_high_8)	/* No NUL flagged in bytes 8-11, so look at bytes 12-15.  */
+	test	$0x01, %dh
+	jnz	L(exit_tail8)
+	test	$0x02, %dh
+	jnz	L(exit_tail9)
+	test	$0x04, %dh
+	jnz	L(exit_tail10)
+	add	$11, %eax	/* Only bit 3 of %dh (byte 11) is left.  */
+	RETURN
+
+	.p2align 4
+L(exit_high_8):
+	test	$0x10, %dh
+	jnz	L(exit_tail12)
+	test	$0x20, %dh
+	jnz	L(exit_tail13)
+	test	$0x40, %dh
+	jnz	L(exit_tail14)
+	add	$15, %eax	/* None of bits 4-6 of %dh matched; take byte 15 and fall into the return below.  */
+L(exit_tail0):
+	RETURN
+
+# ifdef USE_AS_STRNLEN
+
+	.p2align 4
+L(len_less64):
+	pxor	%xmm0, %xmm0
+	add	$64, %edi	/* Undo the "sub $64" done just before the jump here.  */
+
+	pcmpeqb	(%eax), %xmm0	/* Up to four 16-byte blocks, stopping when the limit runs out.  */
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)	/* The limit ended within the block just checked: return the limit.  */
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	movl	LEN(%esp), %eax	/* No NUL found in any of the four blocks: return the limit argument.  */
+	RETURN
+
+	.p2align 4
+L(strnlen_exit):
+	sub	%ecx, %eax	/* %eax = offset of the matching 16-byte block from the string start.  */
+
+	test	%dl, %dl
+	jz	L(strnlen_exit_high)	/* No NUL flagged in bytes 0-7 of the block.  */
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(strnlen_exit_8)	/* No NUL flagged in bytes 0-3, so look at bytes 4-7.  */
+	test	$0x01, %dl
+	jnz	L(exit_tail0)	/* NUL at byte 0: no check against %edi is made on this path.  */
+	test	$0x02, %dl
+	jnz	L(strnlen_exit_tail1)
+	test	$0x04, %dl
+	jnz	L(strnlen_exit_tail2)
+	sub	$4, %edi
+	jb	L(return_start_len)	/* Fewer than 4 bytes of the limit left: return the limit.  */
+	lea	3(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_8):
+	test	$0x10, %dl
+	jnz	L(strnlen_exit_tail4)
+	test	$0x20, %dl
+	jnz	L(strnlen_exit_tail5)
+	test	$0x40, %dl
+	jnz	L(strnlen_exit_tail6)
+	sub	$8, %edi
+	jb	L(return_start_len)	/* Fewer than 8 bytes of the limit left: return the limit.  */
+	lea	7(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(strnlen_exit_high_8)	/* No NUL flagged in bytes 8-11, so look at bytes 12-15.  */
+	test	$0x01, %dh
+	jnz	L(strnlen_exit_tail8)
+	test	$0x02, %dh
+	jnz	L(strnlen_exit_tail9)
+	test	$0x04, %dh
+	jnz	L(strnlen_exit_tail10)
+	sub	$12, %edi
+	jb	L(return_start_len)	/* Fewer than 12 bytes of the limit left: return the limit.  */
+	lea	11(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high_8):
+	test	$0x10, %dh
+	jnz	L(strnlen_exit_tail12)
+	test	$0x20, %dh
+	jnz	L(strnlen_exit_tail13)
+	test	$0x40, %dh
+	jnz	L(strnlen_exit_tail14)
+	sub	$16, %edi
+	jb	L(return_start_len)	/* Fewer than 16 bytes of the limit left: return the limit.  */
+	lea	15(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail1):	/* Each tailN: return the limit if fewer than N+1 bytes of it remain, else add N.  */
+	sub	$2, %edi
+	jb	L(return_start_len)
+	lea	1(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail2):
+	sub	$3, %edi
+	jb	L(return_start_len)
+	lea	2(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail4):
+	sub	$5, %edi
+	jb	L(return_start_len)
+	lea	4(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail5):
+	sub	$6, %edi
+	jb	L(return_start_len)
+	lea	5(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail6):
+	sub	$7, %edi
+	jb	L(return_start_len)
+	lea	6(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail8):
+	sub	$9, %edi
+	jb	L(return_start_len)
+	lea	8(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail9):
+	sub	$10, %edi
+	jb	L(return_start_len)
+	lea	9(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail10):
+	sub	$11, %edi
+	jb	L(return_start_len)
+	lea	10(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail12):
+	sub	$13, %edi
+	jb	L(return_start_len)
+	lea	12(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail13):
+	sub	$14, %edi
+	jb	L(return_start_len)
+	lea	13(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail14):
+	sub	$15, %edi
+	jb	L(return_start_len)
+	lea	14(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(return_start_len):
+	movl	LEN(%esp), %eax	/* Return the limit argument, re-read from the stack.  */
+	RETURN
+
+/* For the prolog only: reached from the entry code when the limit falls within the first 16 bytes.  */
+
+	.p2align 4
+L(len_less4_prolog):
+	xor	%eax, %eax
+
+	add	$4, %edi	/* Undo the entry code's "sub $4": %edi is the limit again.  */
+	jz	L(exit_tail0)	/* A limit of 0 returns 0.  */
+
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$1, %edi
+	je	L(exit_tail1)	/* Limit reached after one non-NUL byte.  */
+
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmp	$2, %edi
+	je	L(exit_tail2)
+
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmp	$3, %edi
+	je	L(exit_tail3)
+
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+	mov	$4, %eax
+	RETURN
+
+	.p2align 4
+L(len_less8_prolog):
+	add	$4, %edi	/* %edi = limit - 4, the number of bytes still allowed from byte 4 on.  */
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmp	$1, %edi
+	je	L(exit_tail5)
+
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmp	$2, %edi
+	je	L(exit_tail6)
+
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmp	$3, %edi
+	je	L(exit_tail7)
+
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+	mov	$8, %eax
+	RETURN
+
+
+	.p2align 4
+L(len_less12_prolog):
+	add	$4, %edi	/* %edi = limit - 8, the number of bytes still allowed from byte 8 on.  */
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmp	$1, %edi
+	je	L(exit_tail9)
+
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmp	$2, %edi
+	je	L(exit_tail10)
+
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmp	$3, %edi
+	je	L(exit_tail11)
+
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+	mov	$12, %eax
+	RETURN
+
+	.p2align 4
+L(len_less16_prolog):
+	add	$4, %edi	/* %edi = limit - 12, the number of bytes still allowed from byte 12 on.  */
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmp	$1, %edi
+	je	L(exit_tail13)
+
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmp	$2, %edi
+	je	L(exit_tail14)
+
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmp	$3, %edi
+	je	L(exit_tail15)
+
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+	mov	$16, %eax
+	RETURN
+# endif	/* USE_AS_STRNLEN */
+
+	.p2align 4
+L(exit_tail1):	/* Each exit_tailN adds N to %eax, then returns.  */
+	add	$1, %eax
+	RETURN
+
+L(exit_tail2):
+	add	$2, %eax
+	RETURN
+
+L(exit_tail3):
+	add	$3, %eax
+	RETURN
+
+L(exit_tail4):
+	add	$4, %eax
+	RETURN
+
+L(exit_tail5):
+	add	$5, %eax
+	RETURN
+
+L(exit_tail6):
+	add	$6, %eax
+	RETURN
+
+L(exit_tail7):
+	add	$7, %eax
+	RETURN
+
+L(exit_tail8):
+	add	$8, %eax
+	RETURN
+
+L(exit_tail9):
+	add	$9, %eax
+	RETURN
+
+L(exit_tail10):
+	add	$10, %eax
+	RETURN
+
+L(exit_tail11):
+	add	$11, %eax
+	RETURN
+
+L(exit_tail12):
+	add	$12, %eax
+	RETURN
+
+L(exit_tail13):
+	add	$13, %eax
+	RETURN
+
+L(exit_tail14):
+	add	$14, %eax
+	RETURN
+
+L(exit_tail15):
+	add	$15, %eax
+# ifndef USE_AS_STRCAT	/* A strcat build emits no return here; control runs on into whatever follows this code.  */
+	RETURN
+END (STRLEN)
+# endif
+#endif	/* (USE_AS_STRNLEN || USE_AS_STRCAT || SHARED) && IS_IN (libc) */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
new file mode 100644
index 0000000000..77cf6bcdb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
@@ -0,0 +1,60 @@
+/* Multiple versions of strlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need strlen before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(strlen)
+	.type	strlen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strlen_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strlen_sse2)
+2:	ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strlen_ia32, @function; \
+	.globl __strlen_ia32; \
+	.p2align 4; \
+	__strlen_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strlen; __GI_strlen = __strlen_ia32
+#endif
+
+#include "../../i586/strlen.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
new file mode 100644
index 0000000000..76581eb62b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
@@ -0,0 +1,8 @@
+#include <string.h>
+
+extern __typeof (strncasecmp) __strncasecmp_nonascii;
+
+#define __strncasecmp __strncasecmp_nonascii
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
new file mode 100644
index 0000000000..a56e63a566
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strncasecmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+	.text
+ENTRY(__strncasecmp)
+	.type	__strncasecmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f
+	LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2)
+2:	ret
+END(__strncasecmp)
+
+weak_alias (__strncasecmp, strncasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
new file mode 100644
index 0000000000..7e601af271
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii;
+
+#define __strncasecmp_l __strncasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL    1
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32)
+
+/* The needs of strncasecmp_l in libc are minimal, no need to go through
+   the IFUNC.  */
+strong_alias (__strncasecmp_l_nonascii, __GI___strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
new file mode 100644
index 0000000000..557210832e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
new file mode 100644
index 0000000000..d438a1ae35
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
new file mode 100644
index 0000000000..8a74ee8574
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strncasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strncasecmp_l
+#define USE_AS_STRNCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strncasecmp_l, strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
new file mode 100644
index 0000000000..132a000545
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_ia32
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
+#endif
+
+#include "string/strncat.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
new file mode 100644
index 0000000000..f1045b72b8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
@@ -0,0 +1,4 @@
+#define STRCAT  __strncat_sse2
+#define USE_AS_STRNCAT
+
+#include "strcat-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000000..625b90a978
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
@@ -0,0 +1,4 @@
+#define STRCAT  __strncat_ssse3
+#define USE_AS_STRNCAT
+
+#include "strcat-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
new file mode 100644
index 0000000000..5c1bf41453
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncat
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..cc059da494
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+# define STRNCMP __strncmp_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32);
+#endif
+
+#include "string/strncmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
new file mode 100644
index 0000000000..cf14dfaf6c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_sse4_2
+# include "strcmp-sse4.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..536c8685f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_ssse3
+# include "strcmp-ssse3.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
new file mode 100644
index 0000000000..150d4786d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncmp
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STRNCMP
+#define STRCMP	strncmp
+#include "strcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..201e3f98b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_ia32
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32);
+#endif
+
+#include "string/strncpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
new file mode 100644
index 0000000000..bdd99239a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000000..bf82ee447d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
new file mode 100644
index 0000000000..9c257efc6e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "strcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
new file mode 100644
index 0000000000..351e939a93
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
@@ -0,0 +1,10 @@
+#define STRNLEN  __strnlen_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name)  \
+    __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \
+    strong_alias (__strnlen_ia32, __strnlen_ia32_1); \
+    __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1);
+#endif
+
+#include "string/strnlen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
new file mode 100644
index 0000000000..56b6ae2a5c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2
+#include "strlen-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
new file mode 100644
index 0000000000..d241522c70
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of strnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__strnlen)
+	.type	__strnlen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strnlen_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strnlen_sse2)
+2:	ret
+END(__strnlen)
+
+weak_alias(__strnlen, strnlen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..5db62053b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
@@ -0,0 +1,2 @@
+#define __strpbrk_sse2 __strpbrk_ia32
+#include <sysdeps/x86_64/multiarch/strpbrk-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
new file mode 100644
index 0000000000..7201d6376f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strpbrk
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
new file mode 100644
index 0000000000..39a7c8825b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
@@ -0,0 +1,282 @@
+/* strrchr with SSE2 with bsf and bsr
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	.text
+ENTRY (__strrchr_sse2_bsf)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	PUSH	(%edi)
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	ja	L(crosscashe)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+	add	$16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_return_value1):
+	bsf	%edx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_null)
+	bsr	%eax, %eax
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+	CFI_PUSH	(%edi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%edx, %edx
+	jnz	L(unaligned_return_value1)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	lea	16(%edi), %esi
+	and	$-16, %edi
+	add	$16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+	L(crosscashe):
+/* Handle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	add	$16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_return_value):
+	add	%ecx, %edi
+	bsf	%edx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_null)
+	bsr	%eax, %eax
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+	CFI_PUSH	(%edi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(unaligned_return_value)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	add	$16, %edi
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	bsr	%ebx, %eax
+	add	%esi, %eax
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	sub	$16, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(return_value_1)
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(return_value_1):
+	bsf	%ecx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	bsr	%eax, %eax
+	add	%edi, %eax
+	sub	$16, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+/* Return NULL.  */
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+	POP	(%edi)
+	xor	%eax, %eax
+	ret
+
+END (__strrchr_sse2_bsf)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
new file mode 100644
index 0000000000..20934288be
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
@@ -0,0 +1,708 @@
+/* strrchr SSE2 without bsf and bsr
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  8
+# define ENTRANCE PUSH(%edi);
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	atom_text_section
+ENTRY (__strrchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	ja	L(crosscache)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	mov	%ebx, %eax
+	mov	%esi, %edi
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(find_zero_8)
+	test	$0x01, %cl
+	jnz	L(FindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(FindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(FindZeroExit3)
+	and	$1 << 4 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_8):
+	test	$0x10, %cl
+	jnz	L(FindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(FindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(FindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(FindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(FindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(FindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(FindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(FindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(FindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit1):
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	.p2align 4
+L(match_exit):
+	test	%ah, %ah
+	jnz	L(match_exit_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(match_exit_8)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_8):
+	test	$0x80, %al
+	jnz	L(Exit8)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(match_exit_high_8)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_high_8):
+	test	$0x80, %ah
+	jnz	L(Exit16)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	-15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	-14(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	-13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	-11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	-10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	lea	-9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	-7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	-6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	-5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	-3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	-2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	lea	-1(%edi), %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(prolog_find_zero_8)
+	test	$0x01, %cl
+	jnz	L(PrologFindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(PrologFindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(PrologFindZeroExit3)
+	and	$1 << 4 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_8):
+	test	$0x10, %cl
+	jnz	L(PrologFindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(PrologFindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(PrologFindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(prolog_find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(PrologFindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(PrologFindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(PrologFindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(PrologFindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(PrologFindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(PrologFindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit1):
+	and	$1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+END (__strrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
new file mode 100644
index 0000000000..d9281eaeae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strrchr)	/* IFUNC resolver for strrchr; the selected implementation is presumably returned in %eax.  */
+	.type	strrchr, @gnu_indirect_function	/* Retype the symbol as an indirect function.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strrchr_ia32)	/* Default candidate.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set after the SSE2 test: return with __strrchr_ia32.  */
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f	/* ZF set after the Slow_BSF test: return with __strrchr_sse2_bsf.  */
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2)	/* Otherwise pick __strrchr_sse2.  */
+2:	ret
+END(strrchr)
+
+# undef ENTRY	/* Redefined below: ignores `name' and always defines __strrchr_ia32.  */
+# define ENTRY(name) \
+	.type __strrchr_ia32, @function; \
+	.globl __strrchr_ia32; \
+	.p2align 4; \
+	__strrchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Redefined below: closes the CFI region and sets the size of __strrchr_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32
+# undef libc_hidden_builtin_def	/* Redefined below: ignores `name'.  */
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strrchr; __GI_strrchr = __strrchr_ia32	/* __GI_strrchr becomes a global alias of __strrchr_ia32.  */
+#endif
+
+#include "../../strrchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
new file mode 100644
index 0000000000..bea09dea71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
@@ -0,0 +1,2 @@
+#define __strspn_sse2 __strspn_ia32	/* Every __strspn_sse2 token in the file included below is renamed to __strspn_ia32.  */
+#include <sysdeps/x86_64/multiarch/strspn-c.c>	/* Reuses the x86_64 C source; presumably it defines __strspn_sse2 -- confirm.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
new file mode 100644
index 0000000000..1269062381
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
@@ -0,0 +1,56 @@
+/* Multiple versions of strspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(strspn)	/* IFUNC resolver for strspn; the selected implementation is presumably returned in %eax.  */
+	.type	strspn, @gnu_indirect_function	/* Retype the symbol as an indirect function.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strspn_ia32)	/* Default candidate.  */
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f	/* ZF set after the SSE4_2 test: return with __strspn_ia32.  */
+	LOAD_FUNC_GOT_EAX (__strspn_sse42)	/* Otherwise pick __strspn_sse42.  */
+2:	ret
+END(strspn)
+
+# undef ENTRY	/* Redefined below: ignores `name' and always defines __strspn_ia32.  */
+# define ENTRY(name) \
+	.type __strspn_ia32, @function; \
+	.globl __strspn_ia32; \
+	.p2align 4; \
+__strspn_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Redefined below: closes the CFI region and sets the size of __strspn_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
+# undef libc_hidden_builtin_def	/* Redefined below: ignores `name'.  */
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strspn; __GI_strspn = __strspn_ia32	/* __GI_strspn becomes a global alias of __strspn_ia32.  */
+#endif
+
+#include "../../strspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
new file mode 100644
index 0000000000..593cfec273
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/test-multiarch.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
new file mode 100644
index 0000000000..7760b966e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
new file mode 100644
index 0000000000..7c72c70d67
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
new file mode 100644
index 0000000000..38d41d04de
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
@@ -0,0 +1,22 @@
+#include <wchar.h>
+
+#if IS_IN (libc)	/* The overrides below apply only when building libc itself.  */
+# undef libc_hidden_weak
+# define libc_hidden_weak(name)	/* Expands to nothing in the included source.  */
+
+# undef weak_alias
+# define weak_alias(name,alias)	/* Expands to nothing, so the included source creates no weak alias.  */
+
+# ifdef SHARED
+#  undef libc_hidden_def	/* Redefined below: ignores `name' and binds both __GI_wcschr and __GI___wcschr to __wcschr_ia32.  */
+#  define libc_hidden_def(name) \
+   __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \
+   strong_alias (__wcschr_ia32, __wcschr_ia32_1); \
+   __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1);
+# endif
+#endif
+
+extern __typeof (wcschr) __wcschr_ia32;	/* Same type as wcschr.  */
+
+#define WCSCHR  __wcschr_ia32	/* Presumably the function name used by the included generic source -- confirm.  */
+#include <wcsmbs/wcschr.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000000..9ff6c3b8d6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -0,0 +1,219 @@
+/* wcschr with SSE2, without using bsf instructions
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* CFI bookkeeping for a 4-byte push of REG.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* CFI bookkeeping for a 4-byte pop of REG.  */
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push REG and record it in the unwind info.  */
+# define POP(REG)	popl REG; CFI_POP (REG)	/* Pop REG and record it in the unwind info.  */
+
+# define PARMS	4	/* %esp offset used for the first argument at function entry.  */
+# define STR1	PARMS	/* Offset of the string pointer argument.  */
+# define STR2	STR1+4	/* Offset of the searched wide-character argument.  */
+
+	atom_text_section
+ENTRY (__wcschr_sse2)	/* Search the 32-bit-element string at STR1 for the value at STR2; %eax = address of the first match, or 0 if a null element precedes any match.  */
+
+	mov	STR1(%esp), %ecx	/* %ecx = string pointer.  */
+	movd	STR2(%esp), %xmm1	/* Low dword of %xmm1 = searched value.  */
+
+	mov	%ecx, %eax
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2	/* %xmm2 = 0, used to detect null elements.  */
+	punpckldq %xmm1, %xmm1	/* After both unpacks all four dwords of %xmm1 hold the searched value.  */
+
+	and	$63, %eax
+	cmp	$48, %eax
+	ja	L(cross_cache)	/* A 16-byte load at %ecx would run past the end of its 64-byte block.  */
+
+	movdqu	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* %xmm2 = per-dword mask of null elements.  */
+	pcmpeqd	%xmm1, %xmm0	/* %xmm0 = per-dword mask of matches.  */
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)	/* A match or a null element is in these 16 bytes.  */
+	and	$-16, %ecx	/* Round down; L(loop) adds 16 before its first load.  */
+	jmp	L(loop)
+
+	.p2align 4
+L(cross_cache):
+	PUSH	(%edi)
+	mov	%ecx, %edi
+	mov	%eax, %ecx
+	and	$-16, %edi	/* %edi = string pointer rounded down to 16 bytes.  */
+	and	$15, %ecx	/* %ecx = byte offset of the string within that 16-byte block.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+
+	sarl	%cl, %edx	/* Drop mask bits belonging to bytes before the string start.  */
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+
+	add	%edi, %ecx	/* %ecx = original string pointer, the base for the shifted masks.  */
+	POP	(%edi)
+
+	test	%edx, %edx
+	jz	L(match_case1)	/* Match present and no null element in range.  */
+	test	%al, %al	/* Same decision sequence as L(match_case2).  */
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	CFI_PUSH (%edi)	/* The code below is entered with %edi still pushed.  */
+
+	.p2align 4
+L(unaligned_no_match):
+	mov	%edi, %ecx	/* Continue from the aligned block address.  */
+	POP	(%edi)
+
+	test	%edx, %edx
+	jnz	L(return_null)	/* Null element found and no match: not present.  */
+
+	pxor	%xmm2, %xmm2
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	add	$16, %ecx	/* Four 16-byte steps per iteration; %xmm2 is still zero whenever no null was found.  */
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jz	L(loop)	/* Nothing found in the fourth block either; otherwise fall into L(matches).  */
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %edx	/* Reload the null mask; the `or' merged the match bits into %edx.  */
+	test	%eax, %eax
+	jz	L(return_null)	/* Only a null element was found.  */
+	test	%edx, %edx
+	jz	L(match_case1)	/* Only a match was found.  */
+
+	.p2align 4
+L(match_case2):	/* Both a match and a null element are in this block.  */
+	test	%al, %al	/* %al covers dwords 0-1, %ah covers dwords 2-3 (4 mask bits each).  */
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)	/* Dword 0 matches.  */
+	test	$15, %dl
+	jnz	L(return_null)	/* Dword 0 is null, before the match in dword 1.  */
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_4):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(match_higth_case2):	/* No match in dwords 0-1.  */
+	test	%dl, %dl
+	jnz	L(return_null)	/* Null element in dwords 0-1 comes first.  */
+	test	$15, %ah
+	jnz	L(match_case2_12)	/* Dword 2 matches.  */
+	test	$15, %dh
+	jnz	L(return_null)	/* Dword 2 is null, before the match in dword 3.  */
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_12):
+	lea	8(%ecx), %eax	/* Returns the address of dword 2 (offset 8).  */
+	ret
+
+	.p2align 4
+L(match_case1):	/* A match and no null element in this block.  */
+	test	%al, %al
+	jz	L(match_higth_case1)
+
+	test	$0x01, %al
+	jnz	L(exit0)	/* Dword 0 matches.  */
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_higth_case1):
+	test	$0x01, %ah
+	jnz	L(exit3)	/* Dword 2 matches.  */
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit0):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit3):
+	lea	8(%ecx), %eax	/* Returns the address of dword 2 (offset 8).  */
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax	/* Return 0.  */
+	ret
+
+END (__wcschr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000000..d3c65a6436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcschr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__wcschr)	/* IFUNC resolver for wcschr; the selected implementation is presumably returned in %eax.  */
+	.type	wcschr, @gnu_indirect_function	/* NOTE(review): names the alias wcschr, not the __wcschr label defined by ENTRY -- confirm both symbols end up typed as intended.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcschr_ia32)	/* Default candidate.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set after the SSE2 test: return with __wcschr_ia32.  */
+	LOAD_FUNC_GOT_EAX (__wcschr_sse2)	/* Otherwise pick __wcschr_sse2.  */
+2:	ret
+END(__wcschr)
+weak_alias (__wcschr, wcschr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
new file mode 100644
index 0000000000..e3337d77e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
@@ -0,0 +1,14 @@
+#include <wchar.h>
+
+#define WCSCMP __wcscmp_ia32	/* Presumably the function name used by the included generic source -- confirm.  */
+#ifdef SHARED
+# undef libc_hidden_def	/* Redefined below: ignores `name' and binds __GI___wcscmp to __wcscmp_ia32.  */
+# define libc_hidden_def(name) \
+  __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32);
+#endif
+#undef weak_alias
+#define weak_alias(name, alias)	/* Expands to nothing, so the included source creates no weak alias.  */
+
+extern __typeof (wcscmp) __wcscmp_ia32;	/* Same type as wcscmp.  */
+
+#include "wcsmbs/wcscmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
new file mode 100644
index 0000000000..a464b58204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -0,0 +1,1018 @@
+/* wcscmp with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* CFI bookkeeping for a 4-byte push of REG.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* CFI bookkeeping for a 4-byte pop of REG.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push REG and record it in the unwind info.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop REG and record it in the unwind info.  */
+
+# define ENTRANCE PUSH(%esi); PUSH(%edi)	/* Save %esi, then %edi.  */
+# define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);	/* Restore both and return; the trailing CFI_PUSHes re-declare them as pushed for code placed after the ret.  */
+# define PARMS  4	/* %esp offset used for the first argument at function entry.  */
+# define STR1  PARMS	/* Offset of the first string argument.  */
+# define STR2  STR1+4	/* Offset of the second string argument.  */
+
+/* Note: wcscmp uses signed comparison, not unsigned as in strcmp function. */
+
+	.text
+ENTRY (__wcscmp_sse2)
+/*
+	* This implementation uses SSE to compare up to 16 bytes at a time.
+*/
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+
+	mov	(%eax), %ecx
+	cmp	%ecx, (%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	4(%eax), %ecx
+	cmp	%ecx, 4(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	8(%eax), %ecx
+	cmp	%ecx, 8(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	12(%eax), %ecx
+	cmp	%ecx, 12(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	ENTRANCE
+	add	$16, %eax
+	add	$16, %edx
+
+	mov	%eax, %esi
+	mov	%edx, %edi
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	mov	%al, %ch
+	mov	%dl, %cl
+	and	$63, %eax		/* esi alignment in cache line */
+	and	$63, %edx		/* edi alignment in cache line */
+	and	$15, %cl
+	jz	L(continue_00)
+	cmp	$16, %edx
+	jb	L(continue_0)
+	cmp	$32, %edx
+	jb	L(continue_16)
+	cmp	$48, %edx
+	jb	L(continue_32)
+
+L(continue_48):
+	and	$15, %ch
+	jz	L(continue_48_00)
+	cmp	$16, %eax
+	jb	L(continue_0_48)
+	cmp	$32, %eax
+	jb	L(continue_16_48)
+	cmp	$48, %eax
+	jb	L(continue_32_48)
+
+	.p2align 4
+L(continue_48_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_48)
+
+L(continue_0):
+	and	$15, %ch
+	jz	L(continue_0_00)
+	cmp	$16, %eax
+	jb	L(continue_0_0)
+	cmp	$32, %eax
+	jb	L(continue_0_16)
+	cmp	$48, %eax
+	jb	L(continue_0_32)
+
+	.p2align 4
+L(continue_0_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_0_48)
+
+	.p2align 4
+L(continue_00):
+	and	$15, %ch
+	jz	L(continue_00_00)
+	cmp	$16, %eax
+	jb	L(continue_00_0)
+	cmp	$32, %eax
+	jb	L(continue_00_16)
+	cmp	$48, %eax
+	jb	L(continue_00_32)
+
+	.p2align 4
+L(continue_00_48):
+	pcmpeqd	(%edi), %xmm0
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)
+
+	cmp	(%esi), %eax
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_32):
+	and	$15, %ch
+	jz	L(continue_32_00)
+	cmp	$16, %eax
+	jb	L(continue_0_32)
+	cmp	$32, %eax
+	jb	L(continue_16_32)
+	cmp	$48, %eax
+	jb	L(continue_32_32)
+
+	.p2align 4
+L(continue_32_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_16):
+	and	$15, %ch
+	jz	L(continue_16_00)
+	cmp	$16, %eax
+	jb	L(continue_0_16)
+	cmp	$32, %eax
+	jb	L(continue_16_16)
+	cmp	$48, %eax
+	jb	L(continue_16_32)
+
+	.p2align 4
+L(continue_16_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_00_00):
+	movdqa	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqa	16(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqa	32(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm5		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm5		/* packed sub of comparison results*/
+	pmovmskb %xmm5, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqa	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_00)
+
+	.p2align 4
+L(continue_00_32):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_16):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_0):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_48_00):
+	pcmpeqd	(%esi), %xmm0
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)
+
+	cmp	(%esi), %eax
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_16_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_0_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_16_16):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_0):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_16):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_0_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_16_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(less4_double_words1):
+	cmp	(%esi), %eax
+	jne	L(nequal)
+	test	%eax, %eax
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(less4_double_words):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_16):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_16)
+	and	$15, %dl
+	jz	L(second_double_word_16)
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_16):
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_16):
+	and	$15, %dh
+	jz	L(fourth_double_word_16)
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_16):
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_32):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_32)
+	and	$15, %dl
+	jz	L(second_double_word_32)
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_32):
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_32):
+	and	$15, %dh
+	jz	L(fourth_double_word_32)
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_32):
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_48):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_48)
+	and	$15, %dl
+	jz	L(second_double_word_48)
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_48):
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_48):
+	and	$15, %dh
+	jz	L(fourth_double_word_48)
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_48):
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(return)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(return):
+	RETURN
+
+	.p2align 4
+L(equal):
+	xorl	%eax, %eax
+	RETURN
+
+	CFI_POP (%edi)
+	CFI_POP (%esi)
+
+	.p2align 4
+L(neq):
+	mov	$1, %eax
+	jg	L(neq_bigger)
+	neg	%eax
+
+L(neq_bigger):
+	ret
+
+	.p2align 4
+L(eq):
+	xorl	%eax, %eax
+	ret
+
+END (__wcscmp_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
new file mode 100644
index 0000000000..7118bdd4db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
@@ -0,0 +1,39 @@
+/* Multiple versions of wcscmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need wcscmp before the initialization
+   has happened.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__wcscmp)
+	.type	__wcscmp, @gnu_indirect_function	/* symbol is an IFUNC resolver, not the function itself */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* presumably sets up the GOT/_rtld_global_ro access used by the macros below -- confirm in init-arch.h */
+	LOAD_FUNC_GOT_EAX (__wcscmp_ia32)	/* default candidate: generic ia32 version (macro name suggests the address lands in %eax) */
+	HAS_CPU_FEATURE (SSE2)	/* CPU feature test; its result is consumed through ZF by the jz below */
+	jz	2f	/* ZF set: keep the default candidate */
+	LOAD_FUNC_GOT_EAX (__wcscmp_sse2)	/* otherwise switch to the SSE2 version */
+2:	ret
+END(__wcscmp)
+weak_alias (__wcscmp, wcscmp)	/* public name wcscmp is a weak alias of the resolver symbol */
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..fb3000392b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcscpy  __wcscpy_ia32	/* presumably makes the generic source included below build under the name __wcscpy_ia32 */
+#endif
+
+#include "wcsmbs/wcscpy.c"	/* generic C implementation; its contents are not visible here */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..6280ba92ab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,600 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4	/* stack offset of the first argument at entry */
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
+# define STR1	PARMS	/* destination pointer */
+# define STR2	STR1+4	/* source pointer */
+# define LEN	STR2+4	/* NOTE(review): not referenced anywhere in this file */
+
+	atom_text_section
+ENTRY (__wcscpy_ssse3)
+	mov	STR1(%esp), %edx	/* edx = destination */
+	mov	STR2(%esp), %ecx	/* ecx = source */
+
+	cmp	$0, (%ecx)	/* terminator among the first four wide characters? */
+	jz	L(ExitTail4)
+	cmp	$0, 4(%ecx)
+	jz	L(ExitTail8)
+	cmp	$0, 8(%ecx)
+	jz	L(ExitTail12)
+	cmp	$0, 12(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi	/* edi keeps the original destination for the return value */
+	PUSH	(%esi)
+	lea	16(%ecx), %esi
+
+	and	$-16, %esi	/* esi = first 16-byte boundary above the source pointer */
+
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, reference value for the terminator tests */
+	pcmpeqd	(%esi), %xmm0	/* terminator test on that aligned block */
+	movdqu	(%ecx), %xmm1	/* copy the first 16 bytes with unaligned accesses */
+	movdqu	%xmm1, (%edx)
+
+	pmovmskb %xmm0, %eax	/* eax = byte mask, 4 bits per wide character */
+	sub	%ecx, %esi	/* esi = byte offset of the aligned block from the source */
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx	/* advance the destination to its next 16-byte boundary */
+	sub	%edx, %eax
+
+	sub	%eax, %ecx	/* advance the source by the same number of bytes */
+	mov	%ecx, %eax
+	and	$0xf, %eax	/* eax = source misalignment relative to 16 bytes */
+	mov	$0, %esi	/* mov leaves the flags of the and above intact for the jz */
+
+	jz	L(Align16Both)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$8, %eax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):	/* source and destination are both 16-byte aligned */
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi	/* esi = offset of the block that was just tested */
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax
+	sub	%eax, %edx	/* shift the destination by the same distance */
+
+	mov	$-0x40, %esi	/* pointers are bumped by 64 before the exit test, so the first block sits at -64 */
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2	/* bytewise minimum folds the four blocks into one vector */
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqd	%xmm0, %xmm3	/* any zero dword in the folded vector? */
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):	/* a bytewise minimum can give a zero dword without a real terminator, so each block is re-tested */
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi	/* lea keeps the flags from test for the jnz below */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqd	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %esi	/* no block held a terminator: reset the offset and resume */
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):	/* source is 4 bytes past a 16-byte boundary: aligned loads combined with palignr */
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at ecx */
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* step back so the aligned loads below start on a 64-byte boundary */
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx	/* move the destination back by the same distance */
+
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)	/* possible terminator: redo these 64 bytes block by block */
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0	/* copy the 12 bytes in front of the block that holds the terminator */
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	POP	(%esi)
+	add	$12, %edx
+	add	$12, %ecx
+	test	%al, %al	/* al == 0: terminator is in the upper 8 bytes of the block */
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0	/* terminator is the second wide character: copy 8 bytes */
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl8):	/* source is 8 bytes past a 16-byte boundary */
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* step back so the aligned loads below start on a 64-byte boundary */
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx	/* move the destination back by the same distance */
+
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)	/* possible terminator: redo these 64 bytes block by block */
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0	/* copy the 8 bytes in front of the block that holds the terminator */
+	movlpd	%xmm0, (%edx)
+	POP	(%esi)
+	add	$8, %edx
+	add	$8, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl12):	/* source is 12 bytes past a 16-byte boundary */
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* step back so the aligned loads below start on a 64-byte boundary */
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx	/* move the destination back by the same distance */
+
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)	/* possible terminator: redo these 64 bytes block by block */
+
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movl	(%ecx), %esi	/* copy the 4 bytes in front of the block that holds the terminator */
+	movl	%esi, (%edx)
+	mov	$4, %esi	/* then fall through with a 4-byte offset */
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* eax = terminator byte mask, esi = offset of the block it describes */
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al	/* al == 0: terminator is in the upper 8 bytes */
+	jz	L(ExitHigh)
+	test	$0x01, %al	/* bit 0 set: the first wide character is the terminator */
+	jnz	L(Exit4)
+L(Exit8):	/* copy 8 bytes, terminator included */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax	/* return the original destination */
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah	/* bit 8 set: the third wide character is the terminator */
+	jnz	L(Exit12)
+L(Exit16):	/* copy 16 bytes, terminator included */
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):	/* copy 4 bytes: just the terminator */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):	/* copy 12 bytes, terminator included */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN
+
+CFI_POP	(%edi)
+
+	.p2align 4
+L(ExitTail4):	/* ExitTail* are reached before any register was pushed, so they use a plain ret */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edx, %eax	/* edx still holds the original destination */
+	ret
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+END (__wcscpy_ssse3)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000000..cfc97dd87c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcscpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function	/* symbol is an IFUNC resolver, not the function itself */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* presumably sets up the GOT/_rtld_global_ro access used by the macros below -- confirm in init-arch.h */
+	LOAD_FUNC_GOT_EAX (__wcscpy_ia32)	/* default candidate: generic ia32 version (macro name suggests the address lands in %eax) */
+	HAS_CPU_FEATURE (SSSE3)	/* CPU feature test; its result is consumed through ZF by the jz below */
+	jz	2f	/* ZF set: keep the default candidate */
+	LOAD_FUNC_GOT_EAX (__wcscpy_ssse3)	/* otherwise switch to the SSSE3 version */
+2:	ret
+END(wcscpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
new file mode 100644
index 0000000000..a335dc0f7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WCSLEN  __wcslen_ia32	/* presumably the name hook read by the wcslen.c included below -- confirm there */
+#endif
+
+extern __typeof (wcslen) __wcslen_ia32;	/* declare __wcslen_ia32 with the same type as wcslen */
+
+#include "wcsmbs/wcslen.c"	/* generic C implementation; its contents are not visible here */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
new file mode 100644
index 0000000000..bd3fc4c79b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -0,0 +1,193 @@
+/* wcslen with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define STR	4	/* stack offset of the string argument at entry */
+
+	.text
+ENTRY (__wcslen_sse2)
+	mov	STR(%esp), %edx	/* edx = string pointer */
+
+	cmp	$0, (%edx)	/* test the first 8 wide characters one at a time */
+	jz	L(exit_tail0)
+	cmp	$0, 4(%edx)
+	jz	L(exit_tail1)
+	cmp	$0, 8(%edx)
+	jz	L(exit_tail2)
+	cmp	$0, 12(%edx)
+	jz	L(exit_tail3)
+	cmp	$0, 16(%edx)
+	jz	L(exit_tail4)
+	cmp	$0, 20(%edx)
+	jz	L(exit_tail5)
+	cmp	$0, 24(%edx)
+	jz	L(exit_tail6)
+	cmp	$0, 28(%edx)
+	jz	L(exit_tail7)
+
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, reference value for the terminator test */
+
+	lea	32(%edx), %eax
+	lea	16(%edx), %ecx	/* ecx = string + 16, the bias subtracted in L(exit) */
+	and	$-16, %eax	/* eax = (string + 32) rounded down to 16 bytes */
+
+	pcmpeqd	(%eax), %xmm0	/* compare 4 wide characters against zero */
+	pmovmskb %xmm0, %edx	/* edx = byte mask, 4 bits per wide character */
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax	/* advance; lea keeps the flags from test for the jnz */
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	and	$-0x40, %eax	/* round down to a 64-byte boundary for the main loop */
+
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+
+	pminub	%xmm1, %xmm0	/* bytewise minimum folds the four blocks into one vector */
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqd	%xmm3, %xmm2	/* xmm3 is all zero here: any zero dword in the folded vector? */
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax
+	jz	L(aligned_64_loop)
+
+	pcmpeqd	-64(%eax), %xmm3	/* a bytewise minimum can give a zero dword without a real terminator, so re-test each block */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx	/* rebias ecx: eax already points past all 64 bytes */
+	jnz	L(exit)
+
+	pcmpeqd	%xmm1, %xmm3	/* xmm1 still holds the second block */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm6, %xmm3	/* xmm6 still holds the fourth block */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	jmp	L(aligned_64_loop)	/* no block held a terminator: resume the loop */
+
+	.p2align 4
+L(exit):
+	sub	%ecx, %eax	/* byte offset from the string start of the block holding the terminator */
+	shr	$2, %eax	/* bytes -> wide characters */
+	test	%dl, %dl	/* dl == 0: terminator is the third or fourth wide character */
+	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl	/* low nibble clear: terminator is the second wide character */
+	jz	L(exit_1)
+	ret
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch	/* low nibble of dh clear: terminator is the fourth wide character */
+	jz	L(exit_3)
+	add	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_1):
+	add	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	add	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail0):	/* exit_tailN: terminator at index N, return N */
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_tail1):
+	mov	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_tail2):
+	mov	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_tail3):
+	mov	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail4):
+	mov	$4, %eax
+	ret
+
+	.p2align 4
+L(exit_tail5):
+	mov	$5, %eax
+	ret
+
+	.p2align 4
+L(exit_tail6):
+	mov	$6, %eax
+	ret
+
+	.p2align 4
+L(exit_tail7):
+	mov	$7, %eax
+	ret
+
+END (__wcslen_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
new file mode 100644
index 0000000000..6ef9b6e7b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of wcslen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__wcslen)
+	.type	__wcslen, @gnu_indirect_function	/* symbol is an IFUNC resolver, not the function itself */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* presumably sets up the GOT/_rtld_global_ro access used by the macros below -- confirm in init-arch.h */
+	LOAD_FUNC_GOT_EAX (__wcslen_ia32)	/* default candidate: generic ia32 version (macro name suggests the address lands in %eax) */
+	HAS_CPU_FEATURE (SSE2)	/* CPU feature test; its result is consumed through ZF by the jz below */
+	jz	2f	/* ZF set: keep the default candidate */
+	LOAD_FUNC_GOT_EAX (__wcslen_sse2)	/* otherwise switch to the SSE2 version */
+2:	ret
+END(__wcslen)
+
+weak_alias(__wcslen, wcslen)	/* public name wcslen is a weak alias of the resolver symbol */
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000000..8d8a335b5b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcsrchr  __wcsrchr_ia32	/* presumably makes the generic source included below build under the name __wcsrchr_ia32 */
+#endif
+
+#include "wcsmbs/wcsrchr.c"	/* generic C implementation; its contents are not visible here */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000000..1a9b60e55e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,354 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	8	/* offset of the first argument once ENTRANCE has pushed %edi */
+# define ENTRANCE	PUSH (%edi);
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
+# define STR1	PARMS	/* string pointer */
+# define STR2	STR1+4	/* wide character to search for */
+
+	atom_text_section
+ENTRY (__wcsrchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx	/* ecx = string pointer */
+	movd	STR2(%esp), %xmm1	/* xmm1 = wide character to search for */
+
+	mov	%ecx, %edi
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2	/* xmm2 = 0, used for the terminator tests */
+	punpckldq %xmm1, %xmm1	/* search character is now replicated in all four lanes */
+
+/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	ja	L(crosscache)	/* offset above 48: a 16-byte load here would cross a 64-byte boundary */
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi	/* continue with aligned loads; part of the first 16 bytes may be scanned again */
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx	/* edx = 0: no match saved so far */
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+
+/* Save current match */
+	mov	%eax, %edx	/* edx = saved match mask */
+	mov	%edi, %esi	/* esi = end address of the block that mask describes */
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx	/* edx = 0: no match saved so far */
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+
+	mov	%eax, %edx	/* save the shifted match mask */
+	lea	(%edi, %ecx), %esi	/* matching end address, adjusted for the bits shifted out above */
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 = terminator mask; stays all zero while none is found */
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm0	/* xmm0 = match mask */
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm4, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm5, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):	/* terminator reached: report the last saved match, if any */
+	test	%edx, %edx	/* edx == 0: nothing was ever matched */
+	jz	L(return_null_1)
+	mov	%edx, %eax
+	mov	%esi, %edi
+
+	POP	(%esi)
+
+	test	%ah, %ah	/* pick the highest wide character that matched in the saved block */
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx	/* reload the terminator mask; ecx was merged with eax above */
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+/* save match info */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):	/* the block holds both a match and the terminator */
+	test	%cl, %cl
+	jz	L(find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(find_zero_in_second_wchar)
+	and	$1, %eax	/* terminator is the first wide character: only a match there counts */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax	/* mask 0x1f (gas binds << tighter than -): drop matches beyond the second wide character */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax	/* mask 0x1ff: drop matches beyond the third wide character */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_fourth_wchar):
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match_second_wchar):	/* edi points 16 bytes past the block start, hence the negative offsets */
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_or_fourth_wchar):
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):	/* prolog_* paths run before %esi is pushed, so they have no saved match to fall back on */
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(prolog_find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_null)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax	/* mask 0x1f: drop matches beyond the second wide character */
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(prolog_find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax	/* mask 0x1ff: drop matches beyond the third wide character */
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+END (__wcsrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
new file mode 100644
index 0000000000..cf67333995
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -0,0 +1,35 @@
+/* Multiple versions of wcsrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(wcsrchr)	/* IFUNC selector: picks one wcsrchr implementation.  */
+	.type	wcsrchr, @gnu_indirect_function	/* Mark the symbol as a GNU indirect function.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcsrchr_ia32)	/* Default choice (presumably left in %eax, per the macro name).  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f				/* ZF set (presumably: feature absent) -> keep the current choice.  */
+	LOAD_FUNC_GOT_EAX (__wcsrchr_sse2)	/* Otherwise switch to the SSE2 version.  */
+2:	ret
+END(wcsrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..75ab4b94c1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>	/* Declares wmemcmp, whose type is reused below.  */
+
+#if IS_IN (libc)
+# define WMEMCMP  __wmemcmp_ia32	/* Set before wcsmbs/wmemcmp.c is included; presumably names the function it defines -- confirm there.  */
+#endif
+
+extern __typeof (wmemcmp) __wmemcmp_ia32;	/* Same type as wmemcmp.  */
+
+#include "wcsmbs/wmemcmp.c"	/* Pull in the C source from wcsmbs/.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..1a857c7e21
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1	/* Seen by memcmp-sse4.S; presumably selects its wmemcmp variant -- confirm there.  */
+#define MEMCMP __wmemcmp_sse4_2	/* Name handed to the included source via MEMCMP.  */
+
+#include "memcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1	/* Seen by memcmp-ssse3.S; presumably selects its wmemcmp variant -- confirm there.  */
+#define MEMCMP __wmemcmp_ssse3	/* Name handed to the included source via MEMCMP.  */
+
+#include "memcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..1b9a54a413
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,40 @@
+/* Multiple versions of wmemcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+	.text
+ENTRY(wmemcmp)	/* IFUNC selector: picks one wmemcmp implementation.  */
+	.type	wmemcmp, @gnu_indirect_function	/* Mark the symbol as a GNU indirect function.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ia32)	/* Default choice (presumably left in %eax, per the macro name).  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f				/* ZF set (presumably: feature absent) -> keep the ia32 version.  */
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)		/* Only tested once the SSSE3 check above has passed.  */
+	jz	2f				/* ZF set -> keep the SSSE3 version.  */
+	LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2)
+2:	ret
+END(wmemcmp)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/nptl/tls.h b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h
new file mode 100644
index 0000000000..5b527af9d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h
@@ -0,0 +1,35 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _TLS_H
+
+/* Additional definitions for <tls.h> on i686 and up.  */
+
+
+/* Macros to load from and store into segment registers.  We can use
+   the 32-bit instructions.  */
+#define TLS_GET_GS() /* Statement expression yielding the %gs value as an int.  */ \
+  ({ int __seg; __asm ("movl %%gs, %0" : "=q" (__seg)); __seg; })
+#define TLS_SET_GS(val) /* Copy VAL into %gs.  */ \
+  __asm ("movl %0, %%gs" :: "q" (val))
+
+
+/* Get the full set of definitions.  */
+#include_next <tls.h>
+
+#endif	/* tls.h */
diff --git a/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S
new file mode 100644
index 0000000000..ce9c94d41a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S
@@ -0,0 +1,20 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define HAVE_CMOV	1	/* Seen by the included i386 source; presumably enables a cmov-based path -- confirm there.  */
+#include <sysdeps/i386/pthread_spin_trylock.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h
new file mode 100644
index 0000000000..9b5a1b0d47
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h
@@ -0,0 +1,23 @@
+/* Define macros for stack address aliasing issues for NPTL.  i686 version.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* What is useful is to avoid the 64k aliasing problem which reliably
+   happens if all stacks use sizes which are a multiple of 64k.  Tell
+   the stack allocator to disturb this by allocating one more page if
+   necessary.  */
+#define MULTI_PAGE_ALIASING     65536	/* 64k, the aliasing interval described above.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/strcmp.S b/REORG.TODO/sysdeps/i386/i686/strcmp.S
new file mode 100644
index 0000000000..1ae305912e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/strcmp.S
@@ -0,0 +1,52 @@
+/* Highly optimized version for ix86, x>=6.
+   Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* first argument sits just above the return address; no saved regs */
+#define STR1	PARMS		/* stack offset of the first string pointer */
+#define STR2	STR1+4		/* stack offset of the second string pointer */
+
+        .text
+ENTRY (strcmp)
+
+	movl	STR1(%esp), %ecx	/* %ecx walks the first string */
+	movl	STR2(%esp), %edx	/* %edx walks the second string */
+
+L(oop):	movb	(%ecx), %al
+	cmpb	(%edx), %al		/* compare current bytes; CF is set if %al is below, unsigned */
+	jne	L(neq)
+	incl	%ecx
+	incl	%edx
+	testb	%al, %al		/* bytes were equal: stop once that byte is NUL */
+	jnz	L(oop)
+
+	xorl	%eax, %eax		/* strings are equal: return 0 */
+	/* when strings are equal, pointers rest one beyond
+	   the end of the NUL terminators.  */
+	ret
+
+L(neq):	movl	$1, %eax		/* start from +1 (first string greater) */
+	movl	$-1, %ecx
+	cmovbl	%ecx, %eax		/* CF is still that of cmpb (movl leaves flags alone): -1 if below */
+
+	ret
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h
new file mode 100644
index 0000000000..51f03fe77b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h
@@ -0,0 +1,44 @@
+/* Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <stdint.h>
+#ifndef __SSE__
+#include_next <tst-stack-align.h>
+#else
+#include <xmmintrin.h>
+
+#define TEST_STACK_ALIGN() /* Prints address and alignment of an __m128, a double and a long double local; yields 1 if any is misaligned, else 0.  */ \
+  ({									     \
+    __m128 _m;								     \
+    double _d = 12.0;							     \
+    long double _ld = 15.0;						     \
+    int _ret = 0;							     \
+    printf ("__m128:  %p %zu\n", &_m, __alignof (__m128));		     \
+    if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0)		     \
+      _ret = 1;								     \
+									     \
+    printf ("double:  %g %p %zu\n", _d, &_d, __alignof (double));	     \
+    if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0)		     \
+      _ret = 1;								     \
+									     \
+    printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double));    \
+    if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0)	     \
+      _ret = 1;								     \
+    _ret;								     \
+    })
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i786/Implies b/REORG.TODO/sysdeps/i386/i786/Implies
new file mode 100644
index 0000000000..1cd29f63cf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i786/Implies
@@ -0,0 +1,2 @@
+# The PPro and PII cores are mostly the same.
+i386/i686
diff --git a/REORG.TODO/sysdeps/i386/init-arch.h b/REORG.TODO/sysdeps/i386/init-arch.h
new file mode 100644
index 0000000000..72881c5679
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MINIMUM_ISA 486	/* Set before the shared x86 header is included; presumably read there -- confirm.  */
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h b/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h
new file mode 100644
index 0000000000..1c95db7287
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h
@@ -0,0 +1,25 @@
+/* Private macros for accessing __jmp_buf contents.  i386 version.
+   Copyright (C) 2006-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define JB_BX	0	/* JB_BX .. JB_PC number the slots 0-5 of a __jmp_buf.  */
+#define JB_SI	1
+#define JB_DI	2
+#define JB_BP	3
+#define JB_SP	4	/* Slot of the saved stack pointer.  */
+#define JB_PC	5	/* Slot of the saved program counter.  */
+#define JB_SIZE 24	/* Presumably bytes: 6 slots of 4 bytes each -- confirm against users.  */
diff --git a/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h b/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h
new file mode 100644
index 0000000000..0a63a832cc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h
@@ -0,0 +1,47 @@
+/* Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <setjmp.h>
+#include <jmpbuf-offsets.h>
+#include <stdint.h>
+#include <unwind.h>
+#include <sysdep.h>
+
+/* Test if longjmp to JMPBUF would unwind the frame containing a local
+   variable at ADDRESS; DEMANGLE is applied to the saved JB_SP slot first.  */
+#define _JMPBUF_UNWINDS(jmpbuf, address, demangle) \
+  ((void *) (address) < (void *) demangle ((jmpbuf)[JB_SP]))
+
+#define _JMPBUF_CFA_UNWINDS_ADJ(_jmpbuf, _context, _adj) /* Same test as _JMPBUF_UNWINDS_ADJ, applied to the CFA of _CONTEXT.  */ \
+  _JMPBUF_UNWINDS_ADJ (_jmpbuf, (void *) _Unwind_GetCFA (_context), _adj)
+
+static inline uintptr_t __attribute__ ((unused))
+_jmpbuf_sp (__jmp_buf regs)	/* Return the stack pointer saved in REGS.  */
+{
+  uintptr_t sp = regs[JB_SP];
+#ifdef PTR_DEMANGLE
+  PTR_DEMANGLE (sp);	/* Applied only if the macro is defined.  */
+#endif
+  return sp;
+}
+
+#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) /* Both sides are shifted by _ADJ before the unsigned compare.  */ \
+  ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj))
+
+/* We use the normal longjmp for unwinding.  */
+#define __libc_unwind_longjmp(buf, val) __libc_longjmp (buf, val)
diff --git a/REORG.TODO/sysdeps/i386/ldbl2mpn.c b/REORG.TODO/sysdeps/i386/ldbl2mpn.c
new file mode 100644
index 0000000000..076be0ae7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ldbl2mpn.c
@@ -0,0 +1,120 @@
+/* Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#include <ieee754.h>
+#include <float.h>
+#include <stdlib.h>
+
+/* Convert a `long double' in IEEE854 extended format (Intel's 80-bit
+   layout) to a multi-precision integer representing the significand
+   scaled up by its number of bits (64 for long double) and an integral
+   power of two stored in *EXPT (MPN frexpl).  Returns the limb count N.  */
+
+mp_size_t
+__mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,	/* SIZE is not referenced in the body.  */
+			   int *expt, int *is_neg,
+			   long double value)
+{
+  union ieee854_long_double u;
+  u.d = value;
+
+  *is_neg = u.ieee.negative;	/* Sign field, passed through unchanged.  */
+  *expt = (int) u.ieee.exponent - IEEE854_LONG_DOUBLE_BIAS;	/* Unbiased exponent; overridden below for the special cases.  */
+
+#if BITS_PER_MP_LIMB == 32
+  res_ptr[0] = u.ieee.mantissa1; /* Low-order 32 bits of fraction.  */
+  res_ptr[1] = u.ieee.mantissa0; /* High-order 32 bits.  */
+  #define N 2
+#elif BITS_PER_MP_LIMB == 64
+  /* Hopefully the compiler will combine the two bitfield extracts
+     and this composition into just the original quadword extract.  */
+  res_ptr[0] = ((mp_limb_t) u.ieee.mantissa0 << 32) | u.ieee.mantissa1;
+  #define N 1
+#else
+  #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
+#endif
+
+  if (u.ieee.exponent == 0)
+    {
+      /* A biased exponent of zero is a special case.
+	 Either it is a zero or it is a denormal number.  */
+      if (res_ptr[0] == 0 && res_ptr[N - 1] == 0) /* Assumes N<=2.  */
+	/* It's zero.  */
+	*expt = 0;
+      else
+	{
+	  /* It is a denormal number, meaning it has no implicit leading
+	     one bit, and its exponent is in fact the format minimum.  */
+	  int cnt;
+
+	  /* One problem with Intel's 80-bit format is that the explicit
+	     leading one in the normalized representation has to be zero
+	     for denormalized numbers.  If it is one, the number is, according
+	     to Intel's specification, an invalid number.  We make the
+	     representation unique by explicitly clearing this bit.  */
+	  res_ptr[N - 1] &= ~((mp_limb_t) 1 << ((LDBL_MANT_DIG - 1) % BITS_PER_MP_LIMB));
+
+	  if (res_ptr[N - 1] != 0)	/* Top limb still has bits: normalize by shifting left.  */
+	    {
+	      count_leading_zeros (cnt, res_ptr[N - 1]);
+	      if (cnt != 0)
+		{
+#if N == 2
+		  res_ptr[N - 1] = res_ptr[N - 1] << cnt
+				   | (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt));
+		  res_ptr[0] <<= cnt;
+#else
+		  res_ptr[N - 1] <<= cnt;
+#endif
+		}
+	      *expt = LDBL_MIN_EXP - 1 - cnt;	/* Exponent drops by the shift distance.  */
+	    }
+	  else if (res_ptr[0] != 0)	/* Only the low limb has bits: move it up a whole limb.  */
+	    {
+	      count_leading_zeros (cnt, res_ptr[0]);
+	      res_ptr[N - 1] = res_ptr[0] << cnt;
+	      res_ptr[0] = 0;
+	      *expt = LDBL_MIN_EXP - 1 - BITS_PER_MP_LIMB - cnt;
+	    }
+	  else
+	    {
+	      /* This is the special case of the pseudo denormal number
+		 with only the explicit leading bit set.  The value is
+		 in fact a normal number and so we have to treat this
+		 case differently.  */
+#if N == 2
+	      res_ptr[N - 1] = 0x80000000ul;
+#else
+	      res_ptr[0] = 0x8000000000000000ul;
+#endif
+	      *expt = LDBL_MIN_EXP - 1;
+	    }
+	}
+    }
+  else if (u.ieee.exponent < 0x7fff
+#if N == 2
+	   && res_ptr[0] == 0
+#endif
+	   && res_ptr[N - 1] == 0)
+    /* Pseudo zero.  */
+    *expt = 0;
+
+  return N;	/* Number of limbs stored in RES_PTR.  */
+}
diff --git a/REORG.TODO/sysdeps/i386/ldsodefs.h b/REORG.TODO/sysdeps/i386/ldsodefs.h
new file mode 100644
index 0000000000..a369f5fc68
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ldsodefs.h
@@ -0,0 +1,41 @@
+/* Run-time dynamic linker data structures for loaded ELF shared objects.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef	_I386_LDSODEFS_H
+#define	_I386_LDSODEFS_H	1
+
+#include <elf.h>
+#include <cpu-features.h>
+
+struct La_i86_regs;	/* Forward declarations: only pointers to these appear below.  */
+struct La_i86_retval;
+
+#define ARCH_PLTENTER_MEMBERS /* Function-pointer member declaration without a trailing ';'; presumably spliced into a struct by the generic <ldsodefs.h> -- confirm there.  */ \
+    Elf32_Addr (*i86_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \
+				    uintptr_t *, struct La_i86_regs *,	\
+				    unsigned int *, const char *name,	\
+				    long int *framesizep)
+
+#define ARCH_PLTEXIT_MEMBERS /* Counterpart member for PLT exit; same splice-in form as above.  */ \
+    unsigned int (*i86_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \
+				     uintptr_t *, const struct La_i86_regs *, \
+				     struct La_i86_retval *, const char *)
+
+#include_next <ldsodefs.h>
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/link-defines.sym b/REORG.TODO/sysdeps/i386/link-defines.sym
new file mode 100644
index 0000000000..0995adb37f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/link-defines.sym
@@ -0,0 +1,20 @@
+#include "link.h"
+#include <stddef.h>
+
+--
+LONG_DOUBLE_SIZE	sizeof (long double)
+
+LR_SIZE			sizeof (struct La_i86_regs)
+LR_EDX_OFFSET		offsetof (struct La_i86_regs, lr_edx)
+LR_ECX_OFFSET		offsetof (struct La_i86_regs, lr_ecx)
+LR_EAX_OFFSET		offsetof (struct La_i86_regs, lr_eax)
+LR_EBP_OFFSET		offsetof (struct La_i86_regs, lr_ebp)
+LR_ESP_OFFSET		offsetof (struct La_i86_regs, lr_esp)
+
+LRV_SIZE		sizeof (struct La_i86_retval)
+LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
+LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
+LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
+LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)
+LRV_BND0_OFFSET		offsetof (struct La_i86_retval, lrv_bnd0)
+LRV_BND1_OFFSET		offsetof (struct La_i86_retval, lrv_bnd1)
diff --git a/REORG.TODO/sysdeps/i386/lshift.S b/REORG.TODO/sysdeps/i386/lshift.S
new file mode 100644
index 0000000000..fa4b07793f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/lshift.S
@@ -0,0 +1,103 @@
+/* i80386 __mpn_lshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+12		/* space for 3 saved regs */
+#define RES	PARMS		/* stack offset of the destination limb pointer */
+#define S	RES+4		/* stack offset of the source limb pointer */
+#define SIZE	S+4		/* stack offset of the limb count */
+#define CNT	SIZE+4		/* stack offset of the shift count */
+
+	.text
+ENTRY (__mpn_lshift)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 8)
+	movl	S(%esp),%esi
+	cfi_rel_offset (esi, 4)
+	movl	SIZE(%esp),%edx
+	movl	CNT(%esp),%ecx		/* shift count is taken from %cl below */
+	subl	$4,%esi			/* adjust s_ptr */
+
+	movl	(%esi,%edx,4),%ebx	/* read most significant limb */
+	cfi_rel_offset (ebx, 0)
+	cfi_remember_state
+	xorl	%eax,%eax
+	shldl	%cl,%ebx,%eax		/* compute carry limb */
+	decl	%edx
+	jz	L(end)			/* single limb: take the short path */
+	pushl	%eax			/* push carry limb onto stack */
+	cfi_adjust_cfa_offset (4)
+	testb	$1,%dl
+	jnz	L(1)			/* enter loop in the middle */
+	movl	%ebx,%eax
+
+	ALIGN (3)
+L(oop):	movl	(%esi,%edx,4),%ebx	/* load next lower limb */
+	shldl	%cl,%ebx,%eax		/* compute result limb */
+	movl	%eax,(%edi,%edx,4)	/* store it */
+	decl	%edx
+L(1):	movl	(%esi,%edx,4),%eax	/* second half: same step with %eax and %ebx swapped */
+	shldl	%cl,%eax,%ebx
+	movl	%ebx,(%edi,%edx,4)
+	decl	%edx
+	jnz	L(oop)
+
+	shll	%cl,%eax		/* compute least significant limb */
+	movl	%eax,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+	cfi_adjust_cfa_offset (-4)
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret				/* %eax holds the carry limb */
+
+	cfi_restore_state
+L(end):	shll	%cl,%ebx		/* compute least significant limb */
+	movl	%ebx,(%edi)		/* store it */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret				/* %eax still holds the carry limb computed above */
+END (__mpn_lshift)
diff --git a/REORG.TODO/sysdeps/i386/machine-gmon.h b/REORG.TODO/sysdeps/i386/machine-gmon.h
new file mode 100644
index 0000000000..d5d8cdf7c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/machine-gmon.h
@@ -0,0 +1,40 @@
+/* i386-specific implementation of profiling support.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* We need a special version of the `mcount' function since for ix86 it
+   must not clobber any register.  This has several reasons:
+     - there is a bug in gcc as of version 2.7.2.2 which prohibits the
+       use of profiling together with nested functions
+     - the ELF `fixup' function uses GCC's regparm feature
+     - some (future) systems might want to pass parameters in registers.  */
+
+/* We must not pollute the global namespace.  */
+#define mcount_internal __mcount_internal
+
+extern void mcount_internal (u_long frompc, u_long selfpc) internal_function;	/* Prototype matching _MCOUNT_DECL below.  */
+
+#define _MCOUNT_DECL(frompc, selfpc) /* Expands to the definition header of mcount_internal.  */ \
+void internal_function mcount_internal (u_long frompc, u_long selfpc)
+
+
+/* Define MCOUNT as empty since we have the implementation in another
+   file.  */
+#define MCOUNT
diff --git a/REORG.TODO/sysdeps/i386/memchr.S b/REORG.TODO/sysdeps/i386/memchr.S
new file mode 100644
index 0000000000..db4a6418ff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memchr.S
@@ -0,0 +1,322 @@
+/* memchr (str, chr, len) -- Return pointer to first occurrence of CHR in STR
+	 less than LEN.  For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+   This version is developed using the same algorithm as the fast C
+   version which carries the following introduction:
+   Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+   with help from Dan Sahlin (dan@sics.se) and
+   commentary by Jim Blandy (jimb@ai.mit.edu);
+   adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+   and implemented by Roland McGrath (roland@ai.mit.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+8		/* space for 2 saved regs */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+#define LEN	CHR+4
+
+	.text
+ENTRY (__memchr)
+
+	/* Save callee-saved registers used in this function.  */
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+
+	/* Load parameters into registers.  */
+	movl STR(%esp), %eax	/* str: pointer to memory block.  */
+	movl CHR(%esp), %edx	/* c: byte we are looking for.  */
+	movl LEN(%esp), %esi	/* len: length of memory block.  */
+	cfi_rel_offset (esi, 4)
+
+	/* If we must not test more than three characters, test
+	   them one by one.  This is especially true for 0.  */
+	cmpl $4, %esi
+	jb L(3)
+
+	/* At the moment %edx contains CHR.  What we need for the
+	   algorithm is CHR in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* Now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* Now c|c|0|0 */
+	movw %cx, %dx		/* And finally c|c|c|c */
+
+	/* Better performance can be achieved if the word (32
+	   bit) memory access is aligned on a four-byte-boundary.
+	   So process first bytes one by one until boundary is
+	   reached. Don't use a loop for better performance.  */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L(4)			/* len==0 => return NULL */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L(4)			/* len==0 => return NULL */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	/* no test for len==0 here, because this is done in the
+	   loop head */
+	jmp L(2)
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for CHR, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is CHR.  This turns each byte that is CHR
+	 into a zero.  */
+
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN (4)
+
+L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.	*/
+	jnc L(8)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is CHR we don't get 0 in %edi.  */
+	jnz L(8)		/* found it => return pointer */
+
+	/* This process is unfolded four times for better performance.
+	   We don't increment the source pointer each time.  Instead we
+	   use offsets and increment by 16 in each run of the loop.  But
+	   before probing for the matching byte we need some extra code
+	   (following L(5) below).  Even the len can be compared with
+	   constants instead of decrementing each time.  */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(7)		/* found it => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(6)		/* found it => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(5)		/* found it => return pointer */
+
+	/* Adjust both counters for a full round, i.e. 16 bytes.  */
+	addl $16, %eax
+L(2):	subl $16, %esi
+	jae L(1)		/* At least 16 bytes remaining */
+
+	/* Process remaining bytes separately.  */
+	cmpl $4-16, %esi	/* rest < 4 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $8-16, %esi	/* rest < 8 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $12-16, %esi	/* rest < 12 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	/* Check the remaining bytes one by one.  */
+L(3):	andl $3, %esi		/* mask out uninteresting bytes */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+
+L(4):	/* no byte found => return NULL */
+	xorl %eax, %eax
+	jmp L(9)
+
+	/* add missing source pointer increments */
+L(5):	addl $4, %eax
+L(6):	addl $4, %eax
+L(7):	addl $4, %eax
+
+	/* Test for the matching byte in the word.  %ecx contains a NUL
+	   char in the byte which originally was the byte we are looking
+	   for.  */
+L(8):	testb %cl, %cl		/* test first byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testb %ch, %ch		/* test second byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testl $0xff0000, %ecx	/* test third byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	/* No further test needed as we know it is one of the four bytes.  */
+L(9):	popl %edi		/* pop saved registers */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+
+	ret
+END (__memchr)
+
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
diff --git a/REORG.TODO/sysdeps/i386/memcmp.S b/REORG.TODO/sysdeps/i386/memcmp.S
new file mode 100644
index 0000000000..01f8f8ef03
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcmp.S
@@ -0,0 +1,73 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define BLK1	PARMS
+#define BLK2	BLK1+4
+#define LEN	BLK2+4
+
+	.text
+ENTRY (memcmp)
+
+	pushl %esi		/* Save callee-saved registers.  */
+	cfi_adjust_cfa_offset (4)
+	movl %edi, %edx		/* Note that %edx is otherwise unused and so
+				   can be used to save %edi.  It's faster.  */
+	cfi_register (edi, edx)
+
+	movl BLK1(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl BLK2(%esp), %edi
+	movl LEN(%esp), %ecx
+
+	cld			/* Set direction of comparison.  */
+
+	xorl %eax, %eax		/* Default result.  */
+
+	repe			/* Compare at most %ecx bytes.  */
+	cmpsb
+	jz L(1)			/* If even last byte was equal we return 0.  */
+
+	/* The memory blocks are not equal.  So result of the last
+	   subtraction is present in the carry flag.  It is set when
+	   the byte in block #2 is bigger.  In this case we have to
+	   return -1 (=0xffffffff), else 1.  */
+	sbbl %eax, %eax		/* This is tricky.  %eax == 0 and carry is set
+				   or not depending on last subtraction.  */
+
+	/* At this point %eax == 0, if the byte of block #1 was bigger, and
+	   0xffffffff if the last byte of block #2 was bigger.  The latter
+	   case is already correct but the former needs a little adjustment.
+	   Note that the following operation does not change 0xffffffff.  */
+	orb $1, %al		/* Change 0 to 1.  */
+
+L(1):	popl %esi		/* Restore registers.  */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	movl %edx, %edi
+	cfi_restore (edi)
+
+	ret
+END (memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
diff --git a/REORG.TODO/sysdeps/i386/memcopy.h b/REORG.TODO/sysdeps/i386/memcopy.h
new file mode 100644
index 0000000000..dc6173ee29
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcopy.h
@@ -0,0 +1,92 @@
+/* memcopy.h -- definitions for memory copy functions.  i386 version.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/memcopy.h>
+
+#undef	OP_T_THRES
+#define	OP_T_THRES	8
+
+#undef	BYTE_COPY_FWD
+#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)				      \
+  do {									      \
+    int __d0;								      \
+    asm volatile(/* Clear the direction flag, so copying goes forward.  */    \
+		 "cld\n"						      \
+		 /* Copy NBYTES bytes; DST_BP and SRC_BP advance.  */	      \
+		 "rep\n"						      \
+		 "movsb" :						      \
+		 "=D" (dst_bp), "=S" (src_bp), "=c" (__d0) :		      \
+		 "0" (dst_bp), "1" (src_bp), "2" (nbytes) :		      \
+		 "memory");						      \
+  } while (0)
+
+#undef	BYTE_COPY_BWD
+#define BYTE_COPY_BWD(dst_ep, src_ep, nbytes)				      \
+  do									      \
+    {									      \
+      int __d0;								      \
+      asm volatile(/* Set the direction flag, so copying goes backwards.  */  \
+		   "std\n"						      \
+		   /* Copy NBYTES bytes, highest address first.  */	      \
+		   "rep\n"						      \
+		   "movsb\n"						      \
+		   /* Clear the dir flag.  Convention says it should be 0. */ \
+		   "cld" :						      \
+		   "=D" (dst_ep), "=S" (src_ep), "=c" (__d0) :		      \
+		   "0" (dst_ep - 1), "1" (src_ep - 1), "2" (nbytes) :	      \
+		   "memory");						      \
+      dst_ep += 1;							      \
+      src_ep += 1;							      \
+    } while (0)
+
+#undef	WORD_COPY_FWD
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)		      \
+  do									      \
+    {									      \
+      int __d0;								      \
+      asm volatile(/* Clear the direction flag, so copying goes forward.  */  \
+		   "cld\n"						      \
+		   /* Copy NBYTES / 4 longwords (4 bytes each).  */	      \
+		   "rep\n"						      \
+		   "movsl" :						      \
+ 		   "=D" (dst_bp), "=S" (src_bp), "=c" (__d0) :		      \
+		   "0" (dst_bp), "1" (src_bp), "2" ((nbytes) / 4) :	      \
+		   "memory");						      \
+      (nbytes_left) = (nbytes) % 4;					      \
+    } while (0)
+
+#undef	WORD_COPY_BWD
+#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes)		      \
+  do									      \
+    {									      \
+      int __d0;								      \
+      asm volatile(/* Set the direction flag, so copying goes backwards.  */  \
+		   "std\n"						      \
+		   /* Copy NBYTES / 4 longwords, highest first.  */	      \
+		   "rep\n"						      \
+		   "movsl\n"						      \
+		   /* Clear the dir flag.  Convention says it should be 0. */ \
+		   "cld" :						      \
+		   "=D" (dst_ep), "=S" (src_ep), "=c" (__d0) :		      \
+		   "0" (dst_ep - 4), "1" (src_ep - 4), "2" ((nbytes) / 4) :   \
+		   "memory");						      \
+      dst_ep += 4;							      \
+      src_ep += 4;							      \
+      (nbytes_left) = (nbytes) % 4;					      \
+    } while (0)
diff --git a/REORG.TODO/sysdeps/i386/memcpy.S b/REORG.TODO/sysdeps/i386/memcpy.S
new file mode 100644
index 0000000000..06568ea724
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcpy.S
@@ -0,0 +1,95 @@
+/* memcpy with REP MOVSB
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		memcpy
+# define MEMCPY_CHK	__memcpy_chk
+#endif
+
+#ifdef USE_AS_BCOPY
+# define STR2		12
+# define STR1		STR2+4
+# define N     		STR1+4
+#else
+# define STR1		12
+# define STR2		STR1+4
+# define N     		STR2+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+	.text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax		/* third argument: byte count N */
+	cmpl	%eax, 16(%esp)		/* compare fourth argument with N */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* fourth argument < N */
+END (MEMCPY_CHK)			/* no ret: runs on into MEMCPY */
+#endif
+ENTRY (MEMCPY)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	movl	N(%esp), %ecx		/* byte count for rep movsb */
+	movl	STR1(%esp), %edi	/* destination */
+	movl	STR2(%esp), %esi	/* source */
+	mov	%edi, %eax		/* result defaults to the destination */
+#ifdef USE_AS_MEMPCPY
+	add	%ecx, %eax		/* mempcpy: result is destination + N */
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%esi, %edi
+	ja	L(copy_backward)	/* destination above source */
+	je	L(bwd_write_0bytes)	/* same address: nothing to copy */
+#endif
+
+	rep	movsb			/* copy N bytes (no cld: DF assumed clear) */
+	POP	(%edi)
+	POP	(%esi)
+	ret
+
+#ifdef USE_AS_MEMMOVE
+L(copy_backward):
+	lea	-1(%edi,%ecx), %edi	/* last byte of the destination */
+	lea	-1(%esi,%ecx), %esi	/* last byte of the source */
+	std				/* make rep movsb run downwards */
+	rep	movsb
+	cld				/* direction flag back to forward */
+L(bwd_write_0bytes):
+	POP	(%edi)
+	POP	(%esi)
+	ret
+#endif
+
+END (MEMCPY)
+
+#ifndef USE_AS_BCOPY
+libc_hidden_builtin_def (MEMCPY)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memcpy_chk.S b/REORG.TODO/sysdeps/i386/memcpy_chk.S
new file mode 100644
index 0000000000..0f6f585c41
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcpy_chk.S
@@ -0,0 +1,34 @@
+/* Checking memcpy for i386.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+	/* For libc.so this is defined in memcpy.S.
+	   For libc.a, this is a separate source to avoid
+	   memcpy bringing in __chk_fail and all routines
+	   it calls.  */
+        .text
+ENTRY (__memcpy_chk)
+	movl	12(%esp), %eax		/* third argument: byte count */
+	cmpl	%eax, 16(%esp)		/* 4th argument, presumably dest size */
+	jb	__chk_fail		/* 4th argument < count => fail */
+	jmp	memcpy			/* otherwise tail-jump to memcpy */
+END (__memcpy_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memmove.S b/REORG.TODO/sysdeps/i386/memmove.S
new file mode 100644
index 0000000000..60a45d21e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memmove.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		memmove
+#define MEMCPY_CHK	__memmove_chk
+#include "memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/memmove_chk.S b/REORG.TODO/sysdeps/i386/memmove_chk.S
new file mode 100644
index 0000000000..0c7037cc05
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memmove_chk.S
@@ -0,0 +1,33 @@
+/* Checking memmove for i386
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in memmove.S.  For libc.a, this is a
+   separate source to avoid memmove bringing in __chk_fail and all
+   routines it calls.  */
+        .text
+ENTRY (__memmove_chk)
+	movl	12(%esp), %eax		/* third argument: byte count */
+	cmpl	%eax, 16(%esp)		/* 4th argument, presumably dest size */
+	jb	__chk_fail		/* 4th argument < count => fail */
+	jmp	memmove			/* otherwise tail-jump to memmove */
+END (__memmove_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/mempcpy.S b/REORG.TODO/sysdeps/i386/mempcpy.S
new file mode 100644
index 0000000000..61addb75f4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mempcpy.S
@@ -0,0 +1,7 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy
+#define MEMCPY_CHK	__mempcpy_chk
+#include "memcpy.S"
+
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/mempcpy_chk.S
new file mode 100644
index 0000000000..4d8ac5c25b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mempcpy_chk.S
@@ -0,0 +1,33 @@
+/* Checking mempcpy for i386
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in mempcpy.S.  For libc.a, this is a
+   separate source to avoid mempcpy bringing in __chk_fail and all
+   routines it calls.  */
+        .text
+ENTRY (__mempcpy_chk)
+	movl	12(%esp), %eax		/* third argument: byte count */
+	cmpl	%eax, 16(%esp)		/* 4th argument, presumably dest size */
+	jb	__chk_fail		/* 4th argument < count => fail */
+	jmp	mempcpy			/* otherwise tail-jump to mempcpy */
+END (__mempcpy_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memset.S b/REORG.TODO/sysdeps/i386/memset.S
new file mode 100644
index 0000000000..46ae65d2e4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memset.S
@@ -0,0 +1,68 @@
+/* memset with REP STOSB
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define STR1  8
+#ifdef USE_AS_BZERO
+#define N     STR1+4
+#else
+#define STR2  STR1+4
+#define N     STR2+4
+#endif
+
+	.text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk)
+	movl	12(%esp), %eax		/* third argument: byte count N */
+	cmpl	%eax, 16(%esp)		/* compare fourth argument with N */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* fourth argument < N */
+END (__memset_chk)			/* no ret: runs on into memset */
+#endif
+ENTRY (memset)
+	PUSH    (%edi)
+	movl	N(%esp), %ecx		/* byte count for rep stosb */
+	movl	STR1(%esp), %edi	/* start of the area to fill */
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax		/* bzero: the fill byte is 0 */
+#else
+	movzbl	STR2(%esp), %eax	/* fill byte: low 8 bits of 2nd argument */
+	mov	%edi, %edx		/* keep start: rep stosb advances %edi */
+#endif
+	rep	stosb			/* store %al into %ecx bytes at (%edi) */
+#ifndef USE_AS_BZERO
+	mov	%edx, %eax		/* result is the original start pointer */
+#endif
+	POP     (%edi)
+	ret
+END (memset)
+
+#ifndef USE_AS_BZERO
+libc_hidden_builtin_def (memset)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memset_chk.S b/REORG.TODO/sysdeps/i386/memset_chk.S
new file mode 100644
index 0000000000..da7837111e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memset_chk.S
@@ -0,0 +1,33 @@
+/* Checking memset for i386.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in memset.S.  For libc.a, this is a
+   separate source to avoid memset bringing in __chk_fail and all
+   routines it calls.  */
+        .text
+ENTRY (__memset_chk)
+	movl	12(%esp), %eax		/* third argument: byte count */
+	cmpl	%eax, 16(%esp)		/* 4th argument, presumably dest size */
+	jb	__chk_fail		/* 4th argument < count => fail */
+	jmp	memset			/* otherwise tail-jump to memset */
+END (__memset_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memusage.h b/REORG.TODO/sysdeps/i386/memusage.h
new file mode 100644
index 0000000000..30167be833
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memusage.h
@@ -0,0 +1,20 @@
+/* Copyright (C) 2000-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; })
+
+#include <sysdeps/generic/memusage.h>
diff --git a/REORG.TODO/sysdeps/i386/mp_clz_tab.c b/REORG.TODO/sysdeps/i386/mp_clz_tab.c
new file mode 100644
index 0000000000..860f98cc62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mp_clz_tab.c
@@ -0,0 +1 @@
+/* __clz_tab not needed on i386.  */
diff --git a/REORG.TODO/sysdeps/i386/mul_1.S b/REORG.TODO/sysdeps/i386/mul_1.S
new file mode 100644
index 0000000000..cf83d1b343
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+   the result in a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* return address + space for 4 saved regs */
+#define RES	PARMS	/* 1st argument: result limb vector */
+#define S1	RES+4	/* 2nd argument: source limb vector */
+#define SIZE	S1+4	/* 3rd argument: number of limbs */
+#define S2LIMB	SIZE+4	/* 4th argument: the multiplier limb */
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_mul_1)
+
+	pushl	%res_ptr		/* Save the four registers used below.  */
+	cfi_adjust_cfa_offset (4)
+	pushl	%s1_ptr
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%s2_limb
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp), %res_ptr	/* Load the four arguments.  */
+	cfi_rel_offset (res_ptr, 12)
+	movl	S1(%esp), %s1_ptr
+	cfi_rel_offset (s1_ptr, 8)
+	movl	SIZE(%esp), %size
+	movl	S2LIMB(%esp), %s2_limb
+	cfi_rel_offset (s2_limb, 0)
+	leal	(%res_ptr,%size,4), %res_ptr	/* Point just past the last result limb.  */
+	leal	(%s1_ptr,%size,4), %s1_ptr	/* Point just past the last source limb.  */
+	negl	%size			/* Index runs from -size up to zero.  */
+	xorl	%ebp, %ebp		/* %ebp holds the carry limb; start at 0.  */
+	cfi_rel_offset (ebp, 4)
+	ALIGN (3)
+L(oop):
+	movl	(%s1_ptr,%size,4), %eax	/* Next source limb.  */
+	mull	%s2_limb		/* %edx:%eax = limb * s2_limb.  */
+	addl	%ebp, %eax		/* Add the carry limb to the low half.  */
+	movl	%eax, (%res_ptr,%size,4)	/* Store the low half; mov leaves CF intact.  */
+	adcl	$0, %edx		/* Fold the carry of that addition into the high half.  */
+	movl	%edx, %ebp		/* The high half is the next carry limb.  */
+
+	incl	%size
+	jnz	L(oop)			/* Tested after the body, so a size of 0 is not handled.  */
+	movl	%ebp, %eax		/* Return the final carry limb.  */
+
+	popl	%s2_limb		/* Restore the saved registers in reverse push order.  */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s2_limb)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%s1_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (s1_ptr)
+	popl	%res_ptr
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (res_ptr)
+
+	ret
+#undef size	/* Presumably so the 'size' macro cannot affect END's expansion; confirm.  */
+END (__mpn_mul_1)
diff --git a/REORG.TODO/sysdeps/i386/nptl/Makefile b/REORG.TODO/sysdeps/i386/nptl/Makefile
new file mode 100644
index 0000000000..2c61b352eb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/Makefile
@@ -0,0 +1,26 @@
+# Copyright (C) 2002-2017 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+ifeq ($(subdir),csu) # Applies only while building the csu subdirectory.
+gen-as-const-headers += tcb-offsets.sym
+endif # subdir = csu
+
+ifeq ($(subdir),nptl) # -mpreferred-stack-boundary=4 asks GCC for 2^4 = 16-byte stack alignment.
+CFLAGS-pthread_create.c += -mpreferred-stack-boundary=4
+CFLAGS-tst-align.c += -mpreferred-stack-boundary=4
+CFLAGS-tst-align2.c += -mpreferred-stack-boundary=4
+endif # subdir = nptl
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c
new file mode 100644
index 0000000000..a1205b9698
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c
@@ -0,0 +1,19 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Not needed.  pthread_spin_init is an alias for pthread_spin_unlock.  */
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S
new file mode 100644
index 0000000000..160244b7a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S
@@ -0,0 +1,37 @@
+/* Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <lowlevellock.h>
+
+	.globl	pthread_spin_lock
+	.type	pthread_spin_lock,@function
+	.align	16
+pthread_spin_lock:
+	mov	4(%esp), %eax		/* %eax = first argument, the address of the lock word.  */
+1:	LOCK				/* LOCK is not defined here; presumably <lowlevellock.h> provides it.  */
+	decl	0(%eax)			/* Decrement the lock word.  */
+	jne	2f			/* Result is nonzero: not acquired, go wait.  */
+	xor	%eax, %eax		/* Result is zero: the lock is ours; return 0.  */
+	ret
+
+	.align	16
+2:	rep				/* "rep; nop" is the encoding of the PAUSE spin-wait hint.  */
+	nop
+	cmpl	$0, 0(%eax)
+	jg	1b			/* Lock word > 0 (signed): retry the decrement.  */
+	jmp	2b			/* Otherwise keep waiting without writing to the word.  */
+	.size	pthread_spin_lock,.-pthread_spin_lock
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S
new file mode 100644
index 0000000000..b6636ae8d7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S
@@ -0,0 +1,31 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+	.globl	pthread_spin_unlock
+	.type	pthread_spin_unlock,@function
+	.align	16
+pthread_spin_unlock:
+	movl	4(%esp), %eax		/* %eax = first argument, the address of the lock word.  */
+	movl	$1, (%eax)		/* Store 1 into the lock word.  */
+	xorl	%eax, %eax		/* Return 0.  */
+	ret
+	.size	pthread_spin_unlock,.-pthread_spin_unlock
+
+	/* pthread_spin_init is identical: store 1 in the lock word, return 0.  */
+	.globl	pthread_spin_init
+pthread_spin_init = pthread_spin_unlock
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h b/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h
new file mode 100644
index 0000000000..54abccd11b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Default stack size (2 MiB).  */
+#define ARCH_STACK_DEFAULT_SIZE	(2 * 1024 * 1024)
+
+/* Required stack pointer alignment at beginning.  SSE requires 16
+   bytes.  */
+#define STACK_ALIGN		16
+
+/* Minimal stack size after allocating thread descriptor and guard size.  */
+#define MINIMAL_REST_STACK	2048
+
+/* Alignment requirement for TCB.
+
+   Some processors such as Intel Atom pay a big penalty on every
+   access using a segment override if that segment's base is not
+   aligned to the size of a cache line.  (See Intel 64 and IA-32
+   Architectures Optimization Reference Manual, section 13.3.3.3,
+   "Segment Base".)  On such machines, a cache line is 64 bytes.  */
+#define TCB_ALIGNMENT		64
+
+
+/* Location of current stack frame: frame address of the expanding function.  */
+#define CURRENT_STACK_FRAME	__builtin_frame_address (0)
diff --git a/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym b/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym
new file mode 100644
index 0000000000..695a810386
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym
@@ -0,0 +1,17 @@
+#include <sysdep.h>
+#include <tls.h>
+#include <kernel-features.h>
+
+RESULT			offsetof (struct pthread, result)
+TID			offsetof (struct pthread, tid)
+CANCELHANDLING		offsetof (struct pthread, cancelhandling)
+CLEANUP_JMP_BUF		offsetof (struct pthread, cleanup_jmp_buf)
+MULTIPLE_THREADS_OFFSET	offsetof (tcbhead_t, multiple_threads)
+SYSINFO_OFFSET		offsetof (tcbhead_t, sysinfo)
+CLEANUP			offsetof (struct pthread, cleanup)
+CLEANUP_PREV		offsetof (struct _pthread_cleanup_buffer, __prev)
+MUTEX_FUTEX		offsetof (pthread_mutex_t, __data.__lock)
+POINTER_GUARD		offsetof (tcbhead_t, pointer_guard)
+#ifndef __ASSUME_PRIVATE_FUTEX
+PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/nptl/tls.h b/REORG.TODO/sysdeps/i386/nptl/tls.h
new file mode 100644
index 0000000000..f9a6b11ecf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/tls.h
@@ -0,0 +1,435 @@
+/* Definition for thread-local data handling.  nptl/i386 version.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _TLS_H
+#define _TLS_H	1
+
+#include <dl-sysdep.h>
+#ifndef __ASSEMBLER__
+# include <stdbool.h>
+# include <stddef.h>
+# include <stdint.h>
+# include <stdlib.h>
+# include <sysdep.h>
+# include <libc-pointer-arith.h> /* For cast_to_integer. */
+# include <kernel-features.h>
+# include <dl-dtv.h>
+
+typedef struct
+{
+  void *tcb;		/* Pointer to the TCB.  Not necessarily the
+			   thread descriptor used by libpthread.  */
+  dtv_t *dtv;		/* Set by INSTALL_DTV, read by GET_DTV.  */
+  void *self;		/* Pointer to the thread descriptor.  */
+  int multiple_threads;
+  uintptr_t sysinfo;	/* Set from GLRO(dl_sysinfo) by INIT_SYSINFO.  */
+  uintptr_t stack_guard;	/* See THREAD_SET_STACK_GUARD.  */
+  uintptr_t pointer_guard;	/* See THREAD_SET_POINTER_GUARD.  */
+  int gscope_flag;	/* One of the THREAD_GSCOPE_FLAG_* values.  */
+#ifndef __ASSUME_PRIVATE_FUTEX
+  int private_futex;
+#else
+  int __glibc_reserved1;	/* Occupies the slot private_futex would use.  */
+#endif
+  /* Reservation of some values for the TM ABI.  */
+  void *__private_tm[4];
+  /* GCC split stack support.  */
+  void *__private_ss;
+} tcbhead_t;
+
+# define TLS_MULTIPLE_THREADS_IN_TCB 1
+
+#else /* __ASSEMBLER__ */
+# include <tcb-offsets.h>
+#endif
+
+
+/* Alignment requirement for the stack.  For IA-32 this is governed by
+   the SSE memory functions.  */
+#define STACK_ALIGN	16
+
+#ifndef __ASSEMBLER__
+/* Get system call information.  */
+# include <sysdep.h>
+
+/* The old way: using LDT.  */
+
+/* Structure passed to `modify_ldt', 'set_thread_area', and 'clone' calls.  */
+struct user_desc
+{
+  unsigned int entry_number;	/* Written as vals[0] by tls_fill_user_desc.  */
+  unsigned long int base_addr;	/* Written as vals[1]: pointer to the TCB.  */
+  unsigned int limit;		/* Written as vals[2].  */
+  unsigned int seg_32bit:1;	/* This and the bit-fields below are written as vals[3].  */
+  unsigned int contents:2;
+  unsigned int read_exec_only:1;
+  unsigned int limit_in_pages:1;
+  unsigned int seg_not_present:1;
+  unsigned int useable:1;
+  unsigned int empty:25;
+};
+
+/* Initializing bit fields is slow.  We speed it up by using a union.  */
+union user_desc_init
+{
+  struct user_desc desc;	/* The view handed to the kernel.  */
+  unsigned int vals[4];		/* The view tls_fill_user_desc writes through.  */
+};
+
+
+/* This is the size of the initial TCB.  Can't be just sizeof (tcbhead_t),
+   because NPTL getpid, __libc_alloca_cutoff etc. need (almost) the whole
+   struct pthread even when not linked with -lpthread.  */
+# define TLS_INIT_TCB_SIZE sizeof (struct pthread)
+
+/* Alignment requirements for the initial TCB.  */
+# define TLS_INIT_TCB_ALIGN __alignof__ (struct pthread)
+
+/* This is the size of the TCB (struct pthread is from <nptl/descr.h>).  */
+# define TLS_TCB_SIZE sizeof (struct pthread)
+
+/* Alignment requirements for the TCB.  */
+# define TLS_TCB_ALIGN __alignof__ (struct pthread)
+
+/* The TCB can have any size and the memory following the address the
+   thread pointer points to is unspecified.  Allocate the TCB there.  */
+# define TLS_TCB_AT_TP	1
+# define TLS_DTV_AT_TP	0
+
+/* Get the thread descriptor definition.  */
+# include <nptl/descr.h>
+
+
+/* Install the dtv pointer.  The pointer passed is to the element with
+   index -1 which contains the length.  */
+# define INSTALL_DTV(descr, dtvp) \
+  ((tcbhead_t *) (descr))->dtv = (dtvp) + 1
+
+/* Install new dtv for current thread; __pd only supplies the member type.  */
+# define INSTALL_NEW_DTV(dtvp) \
+  ({ struct pthread *__pd;						      \
+     THREAD_SETMEM (__pd, header.dtv, (dtvp)); })
+
+/* Return dtv of given thread descriptor.  */
+# define GET_DTV(descr) \
+  (((tcbhead_t *) (descr))->dtv)
+
+/* Macros to read and write the %gs segment register; both can be overridden.  */
+# ifndef TLS_GET_GS
+#  define TLS_GET_GS() \
+  ({ int __seg; __asm ("movw %%gs, %w0" : "=q" (__seg)); __seg & 0xffff; })
+# endif
+# ifndef TLS_SET_GS
+#  define TLS_SET_GS(val) \
+  __asm ("movw %w0, %%gs" :: "q" (val))
+# endif
+
+#ifdef NEED_DL_SYSINFO
+# define INIT_SYSINFO /* Relies on the _head variable declared by TLS_INIT_TP.  */ \
+  _head->sysinfo = GLRO(dl_sysinfo)
+# define SETUP_THREAD_SYSINFO(pd) /* Copy the running thread's sysinfo into PD.  */ \
+  ((pd)->header.sysinfo = THREAD_GETMEM (THREAD_SELF, header.sysinfo))
+# define CHECK_THREAD_SYSINFO(pd) /* Assert PD's sysinfo equals the running thread's.  */ \
+  assert ((pd)->header.sysinfo == THREAD_GETMEM (THREAD_SELF, header.sysinfo))
+#else /* !NEED_DL_SYSINFO: only INIT_SYSINFO is defined, and it expands to nothing.  */
+# define INIT_SYSINFO
+#endif /* NEED_DL_SYSINFO */
+
+#ifndef LOCK_PREFIX
+# ifdef UP
+#  define LOCK_PREFIX  /* nothing */
+# else
+#  define LOCK_PREFIX "lock;" /* Prepended to the THREAD_ATOMIC_* asm templates.  */
+# endif /* UP */
+#endif /* !LOCK_PREFIX */
+
+static inline void __attribute__ ((unused, always_inline))
+tls_fill_user_desc (union user_desc_init *desc,
+                    unsigned int entry_number,
+                    void *pd)
+{
+  /* Word layout of the descriptor, filled in one aggregate assignment:
+     [0] 'entry_number' as given by the caller;
+     [1] 'base_addr', the address of the TCB;
+     [2] 'limit', 0xfffff pages, i.e. the whole 4GB address space;
+     [3] the collapsed bit-field word 0x51, which encodes
+	 seg_32bit = 1, contents = 0, read_exec_only = 0,
+	 limit_in_pages = 1, seg_not_present = 0, useable = 1.  */
+  const union user_desc_init filled =
+    {
+      .vals = { entry_number, (unsigned long int) pd, 0xfffff, 0x51 }
+    };
+
+  *desc = filled;
+}
+
+/* Code to initially initialize the thread pointer.  'errno' is not yet
+   available here, so a failure must not touch it: the macro evaluates to
+   NULL on success and to an error message string otherwise.  */
+# define TLS_INIT_TP(thrdescr) \
+  ({ void *_thrdescr = (thrdescr);					      \
+     tcbhead_t *_head = _thrdescr;					      \
+     union user_desc_init _segdescr;					      \
+     int _result;							      \
+									      \
+     _head->tcb = _thrdescr;						      \
+     /* For now the thread descriptor is at the same address.  */	      \
+     _head->self = _thrdescr;						      \
+     /* New syscall handling support.  */				      \
+     INIT_SYSINFO;							      \
+									      \
+     /* Let the kernel pick a value for the 'entry_number' field.  */	      \
+     tls_fill_user_desc (&_segdescr, -1, _thrdescr);			      \
+									      \
+     /* Install the TLS.  */						      \
+     INTERNAL_SYSCALL_DECL (err);					      \
+     _result = INTERNAL_SYSCALL (set_thread_area, err, 1, &_segdescr.desc);   \
+									      \
+     if (_result == 0)							      \
+       /* We know the index in the GDT, now load the segment register.	      \
+	  The use of the GDT is described by the value 3 in the lower	      \
+	  three bits of the segment selector value.			      \
+									      \
+	  Note that we have to do this even if the numeric value of	      \
+	  the descriptor does not change.  Loading the segment register	      \
+	  causes the segment information from the GDT to be loaded	      \
+	  which is necessary since we have changed it.   */		      \
+       TLS_SET_GS (_segdescr.desc.entry_number * 8 + 3);		      \
+									      \
+     _result == 0 ? NULL						      \
+     : "set_thread_area failed when setting up thread-local storage\n"; })
+
+# define TLS_DEFINE_INIT_TP(tp, pd)					      \
+  union user_desc_init _segdescr;					      \
+  /* Reuse the 'entry_number' the kernel selected in TLS_INIT_TP: it is the  \
+     current thread's %gs value with its low three bits, which select the    \
+     GDT, shifted out.  The filled descriptor, with PD as its base address,  \
+     is then made available through the pointer TP.  */			      \
+  tls_fill_user_desc (&_segdescr, TLS_GET_GS () >> 3, pd);		      \
+  const struct user_desc *tp = &_segdescr.desc
+
+
+/* Return the dtv of the current thread; __pd only supplies the type.  */
+# define THREAD_DTV() \
+  ({ struct pthread *__pd;						      \
+     THREAD_GETMEM (__pd, header.dtv); })
+
+
+/* Return the thread descriptor for the current thread (header.self via %gs).
+
+   The contained asm must *not* be marked volatile since otherwise
+   assignments like
+	pthread_descr self = thread_self();
+   do not get optimized away.  */
+# define THREAD_SELF \
+  ({ struct pthread *__self;						      \
+     asm ("movl %%gs:%c1,%0" : "=r" (__self)				      \
+	  : "i" (offsetof (struct pthread, header.self)));		      \
+     __self;})
+
+/* Magic for libthread_db to know how to do THREAD_SELF.  */
+# define DB_THREAD_SELF \
+  REGISTER_THREAD_AREA (32, offsetof (struct user_regs_struct, xgs), 3) \
+  REGISTER_THREAD_AREA (64, 26 * 8, 3) /* x86-64's user_regs_struct->gs */
+
+
+/* Read MEMBER of the running thread's descriptor; DESCR gives only the type.  */
+# define THREAD_GETMEM(descr, member) \
+  ({ __typeof (descr->member) __value;					      \
+     if (sizeof (__value) == 1)						      \
+       asm volatile ("movb %%gs:%P2,%b0"				      \
+		     : "=q" (__value)					      \
+		     : "0" (0), "i" (offsetof (struct pthread, member)));     \
+     else if (sizeof (__value) == 4)					      \
+       asm volatile ("movl %%gs:%P1,%0"					      \
+		     : "=r" (__value)					      \
+		     : "i" (offsetof (struct pthread, member)));	      \
+     else								      \
+       {								      \
+	 if (sizeof (__value) != 8)					      \
+	   /* Only sizes 1, 4 and 8 are supported.  A size of 8 is read	      \
+	      with two separate 32-bit loads below.  */			      \
+	   abort ();							      \
+									      \
+	 asm volatile ("movl %%gs:%P1,%%eax\n\t"			      \
+		       "movl %%gs:%P2,%%edx"				      \
+		       : "=A" (__value)					      \
+		       : "i" (offsetof (struct pthread, member)),	      \
+			 "i" (offsetof (struct pthread, member) + 4));	      \
+       }								      \
+     __value; })
+
+
+/* Same as THREAD_GETMEM, but reads element IDX of the array MEMBER.  */
+# define THREAD_GETMEM_NC(descr, member, idx) \
+  ({ __typeof (descr->member[0]) __value;				      \
+     if (sizeof (__value) == 1)						      \
+       asm volatile ("movb %%gs:%P2(%3),%b0"				      \
+		     : "=q" (__value)					      \
+		     : "0" (0), "i" (offsetof (struct pthread, member[0])),   \
+		     "r" (idx));					      \
+     else if (sizeof (__value) == 4)					      \
+       asm volatile ("movl %%gs:%P1(,%2,4),%0"				      \
+		     : "=r" (__value)					      \
+		     : "i" (offsetof (struct pthread, member[0])),	      \
+		       "r" (idx));					      \
+     else								      \
+       {								      \
+	 if (sizeof (__value) != 8)					      \
+	   /* Only element sizes 1, 4 and 8 are supported.  A size of 8	      \
+	      is read with two separate 32-bit loads below.  */		      \
+	   abort ();							      \
+									      \
+	 asm volatile  ("movl %%gs:%P1(,%2,8),%%eax\n\t"		      \
+			"movl %%gs:4+%P1(,%2,8),%%edx"			      \
+			: "=&A" (__value)				      \
+			: "i" (offsetof (struct pthread, member[0])),	      \
+			  "r" (idx));					      \
+       }								      \
+     __value; })
+
+
+
+/* Set MEMBER of the running thread's descriptor; DESCR gives only the type.  */
+# define THREAD_SETMEM(descr, member, value) \
+  ({ if (sizeof (descr->member) == 1)					      \
+       asm volatile ("movb %b0,%%gs:%P1" :				      \
+		     : "iq" (value),					      \
+		       "i" (offsetof (struct pthread, member)));	      \
+     else if (sizeof (descr->member) == 4)				      \
+       asm volatile ("movl %0,%%gs:%P1" :				      \
+		     : "ir" (value),					      \
+		       "i" (offsetof (struct pthread, member)));	      \
+     else								      \
+       {								      \
+	 if (sizeof (descr->member) != 8)				      \
+	   /* Only sizes 1, 4 and 8 are supported.  A size of 8 is	      \
+	      written with two separate 32-bit stores below.  */	      \
+	   abort ();							      \
+									      \
+	 asm volatile ("movl %%eax,%%gs:%P1\n\t"			      \
+		       "movl %%edx,%%gs:%P2" :				      \
+		       : "A" ((uint64_t) cast_to_integer (value)),	      \
+			 "i" (offsetof (struct pthread, member)),	      \
+			 "i" (offsetof (struct pthread, member) + 4));	      \
+       }})
+
+
+/* Same as THREAD_SETMEM, but writes element IDX of the array MEMBER.  */
+# define THREAD_SETMEM_NC(descr, member, idx, value) \
+  ({ if (sizeof (descr->member[0]) == 1)				      \
+       asm volatile ("movb %b0,%%gs:%P1(%2)" :				      \
+		     : "iq" (value),					      \
+		       "i" (offsetof (struct pthread, member)),		      \
+		       "r" (idx));					      \
+     else if (sizeof (descr->member[0]) == 4)				      \
+       asm volatile ("movl %0,%%gs:%P1(,%2,4)" :			      \
+		     : "ir" (value),					      \
+		       "i" (offsetof (struct pthread, member)),		      \
+		       "r" (idx));					      \
+     else								      \
+       {								      \
+	 if (sizeof (descr->member[0]) != 8)				      \
+	   /* Only element sizes 1, 4 and 8 are supported.  A size of 8	      \
+	      is written with two separate 32-bit stores below.  */	      \
+	   abort ();							      \
+									      \
+	 asm volatile ("movl %%eax,%%gs:%P1(,%2,8)\n\t"			      \
+		       "movl %%edx,%%gs:4+%P1(,%2,8)" :			      \
+		       : "A" ((uint64_t) cast_to_integer (value)),	      \
+			 "i" (offsetof (struct pthread, member)),	      \
+			 "r" (idx));					      \
+       }})
+
+
+/* Atomic compare and exchange on TLS, returning old value (4-byte members).  */
+#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
+  ({ __typeof (descr->member) __ret;					      \
+     __typeof (oldval) __old = (oldval);				      \
+     if (sizeof (descr->member) == 4)					      \
+       asm volatile (LOCK_PREFIX "cmpxchgl %2, %%gs:%P3"		      \
+		     : "=a" (__ret)					      \
+		     : "0" (__old), "r" (newval),			      \
+		       "i" (offsetof (struct pthread, member)));	      \
+     else								      \
+       /* Not necessary for other sizes at the moment.  */		      \
+       abort ();							      \
+     __ret; })
+
+
+/* Atomic logical and of VAL into a 4-byte MEMBER of the running thread.  */
+#define THREAD_ATOMIC_AND(descr, member, val) \
+  (void) ({ if (sizeof ((descr)->member) == 4)				      \
+	      asm volatile (LOCK_PREFIX "andl %1, %%gs:%P0"		      \
+			    :: "i" (offsetof (struct pthread, member)),	      \
+			       "ir" (val));				      \
+	    else							      \
+	      /* Not necessary for other sizes at the moment.  */	      \
+	      abort (); })
+
+
+/* Atomic set bit: ors 1 << BIT into a 4-byte MEMBER of the running thread.  */
+#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
+  (void) ({ if (sizeof ((descr)->member) == 4)				      \
+	      asm volatile (LOCK_PREFIX "orl %1, %%gs:%P0"		      \
+			    :: "i" (offsetof (struct pthread, member)),	      \
+			       "ir" (1 << (bit)));			      \
+	    else							      \
+	      /* Not necessary for other sizes at the moment.  */	      \
+	      abort (); })
+
+
+/* Set the current thread's stack guard, or copy it into DESCR's TCB head.  */
+#define THREAD_SET_STACK_GUARD(value) \
+  THREAD_SETMEM (THREAD_SELF, header.stack_guard, value)
+#define THREAD_COPY_STACK_GUARD(descr) \
+  ((descr)->header.stack_guard						      \
+   = THREAD_GETMEM (THREAD_SELF, header.stack_guard))
+
+
+/* Set the current thread's pointer guard, or copy it into DESCR's TCB head.  */
+#define THREAD_SET_POINTER_GUARD(value) \
+  THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value)
+#define THREAD_COPY_POINTER_GUARD(descr) \
+  ((descr)->header.pointer_guard					      \
+   = THREAD_GETMEM (THREAD_SELF, header.pointer_guard))
+
+
+/* Flag values and accessors for the gscope_flag field in the TCB head.  */
+#define THREAD_GSCOPE_FLAG_UNUSED 0
+#define THREAD_GSCOPE_FLAG_USED   1
+#define THREAD_GSCOPE_FLAG_WAIT   2
+#define THREAD_GSCOPE_RESET_FLAG() \
+  do									      \
+    { int __res;  /* Receives the flag value found before the reset.  */     \
+      asm volatile ("xchgl %0, %%gs:%P1"				      \
+		    : "=r" (__res)					      \
+		    : "i" (offsetof (struct pthread, header.gscope_flag)),    \
+		      "0" (THREAD_GSCOPE_FLAG_UNUSED));			      \
+      if (__res == THREAD_GSCOPE_FLAG_WAIT)  /* Old value was WAIT: wake.  */ \
+	lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE);    \
+    }									      \
+  while (0)
+#define THREAD_GSCOPE_SET_FLAG() \
+  THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
+#define THREAD_GSCOPE_WAIT() \
+  GL(dl_wait_lookup_done) ()
+
+#endif /* __ASSEMBLER__ */
+
+#endif	/* tls.h */
diff --git a/REORG.TODO/sysdeps/i386/preconfigure b/REORG.TODO/sysdeps/i386/preconfigure
new file mode 100644
index 0000000000..c8fefd1bff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/preconfigure
@@ -0,0 +1,5 @@
+# preconfigure fragment for i386.
+
+case "$machine" in
+i[4567]86)	base_machine=i386 machine=i386/$machine ;; # e.g. i686 becomes i386/i686
+esac
diff --git a/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S
new file mode 100644
index 0000000000..f71a9fcb2d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S
@@ -0,0 +1,46 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread-errnos.h>
+
+
+#ifdef UP
+# define LOCK	/* UP is defined: no lock prefix is emitted.  */
+#else
+# define LOCK lock
+#endif
+
+	.globl	pthread_spin_trylock
+	.type	pthread_spin_trylock,@function
+	.align	16
+pthread_spin_trylock:
+	movl	4(%esp), %edx		/* %edx = first argument, the address of the lock word.  */
+	movl	$1, %eax		/* Value cmpxchg compares the lock word against: 1.  */
+	xorl	%ecx, %ecx		/* Replacement value: 0.  */
+	LOCK
+	cmpxchgl %ecx, (%edx)		/* If (%edx) == %eax, store %ecx there and set ZF.  */
+	movl	$EBUSY, %eax		/* Assume failure; mov does not change ZF.  */
+#ifdef HAVE_CMOV
+	cmovel	%ecx, %eax		/* The exchange happened: return 0 instead.  */
+#else
+	jne	0f			/* No exchange: return EBUSY.  */
+	movl	%ecx, %eax		/* The exchange happened: return 0 instead.  */
+0:
+#endif
+	ret
+	.size	pthread_spin_trylock,.-pthread_spin_trylock
diff --git a/REORG.TODO/sysdeps/i386/rawmemchr.S b/REORG.TODO/sysdeps/i386/rawmemchr.S
new file mode 100644
index 0000000000..246ec3f18e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/rawmemchr.S
@@ -0,0 +1,222 @@
+/* rawmemchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+   This version is developed using the same algorithm as the fast C
+   version which carries the following introduction:
+   Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+   with help from Dan Sahlin (dan@sics.se) and
+   commentary by Jim Blandy (jimb@ai.mit.edu);
+   adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+   and implemented by Roland McGrath (roland@ai.mit.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (__rawmemchr)
+
+	/* Save callee-saved register used in this function.  */
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+
+	/* Load parameters into registers.  */
+	movl STR(%esp), %eax
+	movl CHR(%esp), %edx
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* Now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* Now c|c|0|0 */
+	movw %cx, %dx		/* And finally c|c|c|c */
+
+	/* Better performance can be achieved if the word (32
+	   bit) memory access is aligned on a four-byte-boundary.
+	   So process first bytes one by one until boundary is
+	   reached.  Don't use a loop for better performance.  */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(1)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(1)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(1)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+
+	/* Each round the main loop processes 16 bytes.  */
+	ALIGN (4)
+
+L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.	*/
+	jnc L(8)		/* highest byte is C => return pointer */
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L(8)		/* found it => return pointer */
+
+	/* This process is unfolded four times for better performance.
+	   We don't increment the source pointer each time.  Instead we
+	   use offsets and increment by 16 in each run of the loop.  But
+	   before probing for the matching byte we need some extra code
+	   (following L(5) below) that adds the offset of the word which
+	   matched back onto the source pointer.  */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(7)		/* found it => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(6)		/* found it => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(5)		/* found it => return pointer */
+
+	/* Advance the source pointer for a full round, i.e. 16 bytes.  */
+	addl $16, %eax
+	jmp L(1)
+	/* add missing source pointer increments */
+L(5):	addl $4, %eax
+L(6):	addl $4, %eax
+L(7):	addl $4, %eax
+
+	/* Test for the matching byte in the word.  %ecx contains a NUL
+	   char in the byte which originally was the byte we are looking
+	   for.  */
+L(8):	testb %cl, %cl		/* test first byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testb %ch, %ch		/* test second byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testl $0xff0000, %ecx	/* test third byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	/* No further test needed because we know it is one of the four bytes.  */
+
+L(9):
+	popl %edi		/* pop saved register */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__rawmemchr)
+
+libc_hidden_def (__rawmemchr)
+weak_alias (__rawmemchr, rawmemchr)
diff --git a/REORG.TODO/sysdeps/i386/rshift.S b/REORG.TODO/sysdeps/i386/rshift.S
new file mode 100644
index 0000000000..cf179052b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/rshift.S
@@ -0,0 +1,105 @@
+/* i80386 __mpn_rshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+12		/* space for 3 saved regs */
+#define RES	PARMS
+#define S	RES+4
+#define SIZE	S+4
+#define CNT	SIZE+4
+
+	.text
+ENTRY (__mpn_rshift)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi
+	cfi_rel_offset (edi, 8)
+	movl	S(%esp),%esi
+	cfi_rel_offset (esi, 4)
+	movl	SIZE(%esp),%edx
+	movl	CNT(%esp),%ecx
+	leal	-4(%edi,%edx,4),%edi	/* %edi = RES + 4*SIZE - 4 */
+	leal	(%esi,%edx,4),%esi	/* %esi = S + 4*SIZE */
+	negl	%edx			/* index = -SIZE, counts up to 0 */
+
+	movl	(%esi,%edx,4),%ebx	/* read least significant limb */
+	cfi_rel_offset (ebx, 0)
+	cfi_remember_state
+	xorl	%eax,%eax
+	shrdl	%cl,%ebx,%eax		/* compute carry limb */
+	incl	%edx
+	jz	L(end)			/* SIZE was 1 => only one limb */
+	pushl	%eax			/* push carry limb onto stack */
+	cfi_adjust_cfa_offset (4)
+	testb	$1,%dl
+	jnz	L(1)			/* enter loop in the middle */
+	movl	%ebx,%eax		/* loop head expects the limb in %eax */
+
+	ALIGN (3)
+L(oop):	movl	(%esi,%edx,4),%ebx	/* load next higher limb */
+	shrdl	%cl,%ebx,%eax		/* compute result limb */
+	movl	%eax,(%edi,%edx,4)	/* store it */
+	incl	%edx
+L(1):	movl	(%esi,%edx,4),%eax	/* load next higher limb */
+	shrdl	%cl,%eax,%ebx		/* compute result limb */
+	movl	%ebx,(%edi,%edx,4)	/* store it */
+	incl	%edx
+	jnz	L(oop)			/* repeat until the index reaches 0 */
+
+	shrl	%cl,%eax		/* compute most significant limb */
+	movl	%eax,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+	cfi_adjust_cfa_offset (-4)
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_restore_state
+L(end):	shrl	%cl,%ebx		/* compute most significant limb */
+	movl	%ebx,(%edi)		/* store it */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_rshift)
diff --git a/REORG.TODO/sysdeps/i386/setfpucw.c b/REORG.TODO/sysdeps/i386/setfpucw.c
new file mode 100644
index 0000000000..40b995f18a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/setfpucw.c
@@ -0,0 +1,54 @@
+/* Set the FPU control word for x86.
+   Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <fpu_control.h>
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+void
+__setfpucw (fpu_control_t set)
+{
+  fpu_control_t cw;
+
+  /* Fetch the current control word.  */
+  __asm__ ("fnstcw %0" : "=m" (*&cw));
+
+  /* Preserve the reserved bits, and set the rest as the user
+     specified.  SET is used as given; zero is not replaced here.  */
+  cw &= _FPU_RESERVED;
+  cw |= set & ~_FPU_RESERVED;
+
+  __asm__ ("fldcw %0" : : "m" (*&cw));
+
+  /* If the CPU supports SSE, we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+      /* Copy SET's 0xc00 bits (<< 3) and FE_ALL_EXCEPT bits (<< 7).  */
+      xnew_exc &= ~((0xc00 << 3) | (FE_ALL_EXCEPT << 7));
+      xnew_exc |= ((set & 0xc00) << 3) | ((set & FE_ALL_EXCEPT) << 7);
+
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+}
diff --git a/REORG.TODO/sysdeps/i386/setjmp.S b/REORG.TODO/sysdeps/i386/setjmp.S
new file mode 100644
index 0000000000..738a899e8b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/setjmp.S
@@ -0,0 +1,58 @@
+/* setjmp for i386.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <asm-syntax.h>
+#include <stap-probe.h>
+
+#define PARMS	4		/* no space for saved regs */
+#define JMPBUF	PARMS
+#define SIGMSK	JMPBUF+4
+
+ENTRY (__sigsetjmp)
+
+	movl JMPBUF(%esp), %eax	/* %eax = first argument (the jump buffer).  */
+
+	/* Save registers.  */
+	movl %ebx, (JB_BX*4)(%eax)
+	movl %esi, (JB_SI*4)(%eax)
+	movl %edi, (JB_DI*4)(%eax)
+	leal JMPBUF(%esp), %ecx	/* Save SP as it will be after we return.  */
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)	/* Stored SP is the mangled value.  */
+#endif
+	movl %ecx, (JB_SP*4)(%eax)
+	movl 0(%esp), %ecx	/* Save PC we are returning to now.  */
+	LIBC_PROBE (setjmp, 3, 4@%eax, -4@SIGMSK(%esp), 4@%ecx)
+#ifdef PTR_MANGLE
+	PTR_MANGLE (%ecx)	/* Stored PC is the mangled value.  */
+#endif
+	movl %ecx, (JB_PC*4)(%eax)
+	movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer.  */
+
+#if IS_IN (rtld)
+	/* In ld.so we never save the signal mask.  */
+	xorl %eax, %eax		/* Return 0.  */
+	ret
+#else
+	/* Make a tail call to __sigjmp_save; it takes the same args.  */
+	jmp __sigjmp_save
+#endif
+END (__sigsetjmp)
+hidden_def (__sigsetjmp)
diff --git a/REORG.TODO/sysdeps/i386/stackguard-macros.h b/REORG.TODO/sysdeps/i386/stackguard-macros.h
new file mode 100644
index 0000000000..039762927c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stackguard-macros.h
@@ -0,0 +1,12 @@
+#include <stdint.h>
+
+#define STACK_CHK_GUARD /* Value of the word at %gs:0x14.  */ \
+  ({ uintptr_t x; asm ("movl %%gs:0x14, %0" : "=r" (x)); x; })
+/* Word at the pointer_guard offset in %gs; needs offsetof and tcbhead_t.  */
+#define POINTER_CHK_GUARD \
+  ({							\
+     uintptr_t x;					\
+     asm ("movl %%gs:%c1, %0" : "=r" (x)		\
+	  : "i" (offsetof (tcbhead_t, pointer_guard)));	\
+     x;							\
+   })
diff --git a/REORG.TODO/sysdeps/i386/stackinfo.h b/REORG.TODO/sysdeps/i386/stackinfo.h
new file mode 100644
index 0000000000..ba17867d3a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stackinfo.h
@@ -0,0 +1,43 @@
+/* Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file contains a bit of information about the stack allocation
+   of the processor.  */
+
+#ifndef _STACKINFO_H
+#define _STACKINFO_H	1
+
+#include <elf.h>
+
+/* On x86 the stack grows down.  */
+#define _STACK_GROWS_DOWN	1
+
+/* Default to an executable stack.  PF_X can be overridden if PT_GNU_STACK is
+ * present, but it is presumed absent.  */
+#define DEFAULT_STACK_PERMS (PF_R|PF_W|PF_X)
+
+/* stackinfo_get_sp () yields %esp; stackinfo_sub_sp (PTR) yields PTR - %esp.
+   The macros are used in alloca_account for which they need to act as
+   barriers as well, hence the additional (unnecessary) parameters.  */
+#define stackinfo_get_sp() \
+  ({ void *p__; asm volatile ("mov %%esp, %0" : "=r" (p__)); p__; })
+#define stackinfo_sub_sp(ptr) \
+  ({ ptrdiff_t d__;                                             \
+     asm volatile ("sub %%esp, %0" : "=r" (d__) : "0" (ptr));   \
+     d__; })
+
+#endif	/* stackinfo.h */
diff --git a/REORG.TODO/sysdeps/i386/start.S b/REORG.TODO/sysdeps/i386/start.S
new file mode 100644
index 0000000000..ccb1e2b38f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/start.S
@@ -0,0 +1,139 @@
+/* Startup code compliant to the ELF i386 ABI.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file with other
+   programs, and to distribute those programs without any restriction
+   coming from the use of this file. (The GNU Lesser General Public
+   License restrictions do apply in other respects; for example, they
+   cover modification of the file, and distribution when not linked
+   into another program.)
+
+   Note that people who make modified versions of this file are not
+   obligated to grant this special exception for their modified
+   versions; it is their choice whether to do so. The GNU Lesser
+   General Public License gives permission to release a modified
+   version without this exception; this exception also makes it
+   possible to release a modified version which carries forward this
+   exception.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is the canonical entry point, usually the first thing in the text
+   segment.  The SVR4/i386 ABI (pages 3-31, 3-32) says that when the entry
+   point runs, most registers' values are unspecified, except for:
+
+   %edx		Contains a function pointer to be registered with `atexit'.
+		This is how the dynamic linker arranges to have DT_FINI
+		functions called for shared libraries that have been loaded
+		before this code runs.
+
+   %esp		The stack contains the arguments and environment:
+		0(%esp)			argc
+		4(%esp)			argv[0]
+		...
+		(4*argc)(%esp)		NULL
+		(4*(argc+1))(%esp)	envp[0]
+		...
+					NULL
+*/
+
+	.text
+	.globl _start
+	.type _start,@function
+_start:
+	/* Clear the frame pointer.  The ABI suggests this be done, to mark
+	   the outermost frame obviously.  */
+	xorl %ebp, %ebp
+
+	/* Extract the arguments as encoded on the stack and set up
+	   the arguments for `main': argc, argv.  envp will be determined
+	   later in __libc_start_main.  */
+	popl %esi		/* Pop the argument count.  */
+	movl %esp, %ecx		/* argv starts just at the current stack top.  */
+
+	/* Before pushing the arguments align the stack to a 16-byte
+	   (SSE needs 16-byte alignment) boundary to avoid penalties from
+	   misaligned accesses.  Thanks to Edward Seidl <seidl@janed.com>
+	   for pointing this out.  */
+	andl $0xfffffff0, %esp
+	pushl %eax		/* Push garbage because we allocate
+				   28 more bytes.  */
+
+	/* Provide the highest stack address to the user code (for stacks
+	   which grow downwards).  */
+	pushl %esp
+
+	pushl %edx		/* Push address of the shared library
+				   termination function.  */
+
+#ifdef SHARED
+	/* Load PIC register.  */
+	call 1f			/* 1: copies the return address into %ebx.  */
+	addl $_GLOBAL_OFFSET_TABLE_, %ebx
+
+	/* Push address of our own entry points to .fini and .init.  */
+	leal __libc_csu_fini@GOTOFF(%ebx), %eax
+	pushl %eax
+	leal __libc_csu_init@GOTOFF(%ebx), %eax
+	pushl %eax
+
+	pushl %ecx		/* Push second argument: argv.  */
+	pushl %esi		/* Push first argument: argc.  */
+
+	pushl main@GOT(%ebx)	/* Push the GOT entry for main.  */
+
+	/* Call the user's main function, and exit with its value.
+	   But let the libc call main.  */
+	call __libc_start_main@PLT
+#else
+	/* Push address of our own entry points to .fini and .init.  */
+	pushl $__libc_csu_fini
+	pushl $__libc_csu_init
+
+	pushl %ecx		/* Push second argument: argv.  */
+	pushl %esi		/* Push first argument: argc.  */
+
+	pushl $main		/* Push the address of main.  */
+
+	/* Call the user's main function, and exit with its value.
+	   But let the libc call main.  */
+	call __libc_start_main
+#endif
+
+	hlt			/* Crash if somehow `exit' does return.  */
+
+#ifdef SHARED
+1:	movl	(%esp), %ebx	/* Return address of the caller => %ebx.  */
+	ret
+#endif
+
+/* To fulfill the System V/i386 ABI we need this symbol.  Yuck, it's so
+   meaningless since we don't support machines < 80386.  */
+	.section .rodata
+	.globl _fp_hw
+_fp_hw:	.long 3
+	.size _fp_hw, 4
+	.type _fp_hw,@object
+
+/* Define a symbol for the first piece of initialized data.  */
+	.data
+	.globl __data_start
+__data_start:
+	.long 0
+	.weak data_start
+	data_start = __data_start
diff --git a/REORG.TODO/sysdeps/i386/stpcpy.S b/REORG.TODO/sysdeps/i386/stpcpy.S
new file mode 100644
index 0000000000..d9981b677b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stpcpy.S
@@ -0,0 +1,88 @@
+/* Copy SRC to DEST returning the address of the terminating '\0' in DEST.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This function is defined neither in ANSI nor POSIX standards but is
+   also not invented here.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+
+	.text
+ENTRY (__stpcpy)
+
+	movl DEST(%esp), %eax
+	movl SRC(%esp), %ecx
+	subl %eax, %ecx		/* magic: reduce number of loop variables
+				   to one using addressing mode */
+
+	/* Here we would like to write
+
+	subl $4, %eax
+	ALIGN (4)
+
+	but the assembler is too smart and optimizes for the shortest
+	form where the number only needs one byte.  But if we could
+	have the long form we would not need the alignment.  */
+
+	.byte 0x81, 0xe8	/* This is `subl $0x00000004, %eax' */
+	.long 0x00000004
+
+	/* Four times unfolded loop with only one loop counter.  This
+	   is achieved by the use of index+base addressing mode.  As the
+	   loop counter we use the destination address because this is
+	   also the result.  */
+L(1):	addl $4, %eax		/* increment loop counter */
+
+	movb (%eax,%ecx), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(2)			/* yes, then exit */
+
+	movb 1(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 1(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(3)			/* yes, then exit */
+
+	movb 2(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 2(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(4)			/* yes, then exit */
+
+	movb 3(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 3(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jnz L(1)		/* no, then continue loop */
+
+	incl %eax		/* correct loop counter */
+L(4):	incl %eax
+L(3):	incl %eax
+L(2):				/* %eax = address of the NUL stored in DEST */
+
+	ret
+END (__stpcpy)
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/stpncpy.S b/REORG.TODO/sysdeps/i386/stpncpy.S
new file mode 100644
index 0000000000..46f2aba713
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stpncpy.S
@@ -0,0 +1,147 @@
+/* copy no more than N bytes from SRC to DEST, returning the address of
+   the terminating '\0' in DEST.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Some bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+     - original wrote n+1 chars in some cases.
+     - stpncpy() ought to behave like strncpy() ie. not null-terminate
+       if limited by n.  glibc-1.09 stpncpy() does this.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+	.text
+ENTRY (__stpncpy)
+
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	movl DEST(%esp), %eax
+	movl SRC(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl LEN(%esp), %ecx
+
+	subl %eax, %esi		/* magic: reduce number of loop variables
+				   to one using addressing mode */
+	jmp L(1)		/* jump to loop "head" */
+
+	ALIGN(4)
+
+	/* Four times unfolded loop with two loop counters.  We get the
+	   third value (the source address) by using the index+base
+	   addressing mode.  */
+L(2):	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(7)			/* yes, then exit */
+
+	movb 1(%eax,%esi), %dl	/* load current char */
+	movb %dl, 1(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(6)			/* yes, then exit */
+
+	movb 2(%eax,%esi), %dl	/* load current char */
+	movb %dl, 2(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(5)			/* yes, then exit */
+
+	movb 3(%eax,%esi), %dl	/* load current char */
+	movb %dl, 3(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(4)			/* yes, then exit */
+
+	addl $4, %eax		/* increment loop counter for full round */
+
+L(1):	subl $4, %ecx		/* still at least 4 bytes allowed? */
+	jae L(2)		/* yes, then go to start of loop */
+
+	/* The at most 3 remaining bytes are not processed in a loop.  */
+
+	addl $4, %ecx		/* correct above subtraction */
+	jz L(9)			/* maximal allowed char reached => go to end */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(3)			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	decl %ecx		/* decrement length counter */
+	jz L(9)			/* no more allowed => exit */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(3)			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	decl %ecx		/* decrement length counter */
+	jz L(9)			/* no more allowed => exit */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L(3)			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	jmp L(9)		/* we don't have to test for counter underflow
+				   because we know we had at most 3 bytes
+				   remaining => exit */
+
+	/* When coming from the main loop we have to adjust the pointer.  */
+L(4):	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+
+L(5):	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+
+L(6):	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+L(7):
+
+	addl $3, %ecx		/* correct pre-decrementation of counter
+				   at the beginning of the loop; but why 3
+				   and not 4?  Very simple, we have to count
+				   the NUL char we already wrote.  */
+	jz L(9)			/* counter is also 0 => exit */
+
+	/* We now have to fill the rest of the buffer with NUL.  This
+	   is done in a tricky way.  Please note that the addressing mode
+	   used below is not the same we used above.  Here we use the
+	   %ecx register.  */
+L(8):
+	movb $0, (%ecx,%eax)	/* store NUL char */
+L(3):	decl %ecx		/* all bytes written? */
+	jnz L(8)		/* no, then again */
+
+L(9):	popl %esi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+
+	ret
+END (__stpncpy)
+
+libc_hidden_def (__stpncpy)
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/strcat.S b/REORG.TODO/sysdeps/i386/strcat.S
new file mode 100644
index 0000000000..4a26b3c528
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strcat.S
@@ -0,0 +1,265 @@
+/* strcat(dest, src) -- Append SRC on the end of DEST.
+   For Intel 80x86, x>=4.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
+   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* return address + space for 1 saved reg */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+
+	.text
+ENTRY (strcat)
+
+	pushl %edi		/* Save callee-saved register.  */
+	cfi_adjust_cfa_offset (4)
+
+	movl DEST(%esp), %edx	/* %edx = destination pointer */
+	movl SRC(%esp), %ecx	/* %ecx = source pointer */
+
+	testb $0xff, (%ecx)	/* Is source string empty? */
+	jz L(8)			/* yes => return */
+
+	/* Test the first bytes separately until destination is aligned.  */
+	testl $3, %edx		/* destination pointer aligned? */
+	jz L(1)			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L(2)			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	testl $3, %edx		/* destination pointer aligned? */
+	jz L(1)			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L(2)			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	testl $3, %edx		/* destination pointer aligned? */
+	jz L(1)			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L(2)			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	/* Now we are aligned.  Begin scan loop.  */
+	jmp L(1)
+
+	cfi_rel_offset (edi, 0)	/* %edi is not written before this point */
+	ALIGN(4)
+
+L(4):	addl $16,%edx		/* increment destination pointer for round */
+
+L(1):	movl (%edx), %eax	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+
+	/* If you compare this with the algorithm in memchr.S you will
+	   notice that an `xorl' statement is missing here.  But remember
+	   that we are looking for C == 0 and `xorl $0, %eax'
+	   is a no-op.  */
+
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.  */
+	jnc L(3)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %eax, %edi		/* (word+magic)^word */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is NUL we don't get 0 in %edi.  */
+	jnz L(3)
+
+	movl 4(%edx), %eax	/* get next word from destination */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is NUL => stop scanning */
+	xorl %eax, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(5)		/* one byte is NUL => stop scanning */
+
+	movl 8(%edx), %eax	/* get next word from destination */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is NUL => stop scanning */
+	xorl %eax, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(6)		/* one byte is NUL => stop scanning */
+
+	movl 12(%edx), %eax	/* get next word from destination */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is NUL => stop scanning */
+	xorl %eax, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(4)			/* no byte is NUL => carry on scanning */
+
+L(7):	addl $4, %edx		/* adjust destination pointer */
+L(6):	addl $4, %edx
+L(5):	addl $4, %edx
+
+L(3):	testb %al, %al		/* is first byte NUL? */
+	jz L(2)			/* yes => start copying */
+	incl %edx		/* increment destination pointer */
+
+	testb %ah, %ah		/* is second byte NUL? */
+	jz L(2)			/* yes => start copying */
+	incl %edx		/* increment destination pointer */
+
+	testl $0xff0000, %eax	/* is third byte NUL? */
+	jz L(2)			/* yes => start copying */
+	incl %edx		/* increment destination pointer */
+
+L(2):	subl %ecx, %edx		/* %edx = dest end - src, so that (%ecx,%edx) addresses the destination */
+
+	/* Now we have to align the source pointer.  */
+	testl $3, %ecx		/* pointer correctly aligned? */
+	jz L(29)		/* yes => start copy loop */
+	movb (%ecx), %al	/* get first byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL? */
+	jz L(8)			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	testl $3, %ecx		/* pointer correctly aligned? */
+	jz L(29)		/* yes => start copy loop */
+	movb (%ecx), %al	/* get next byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL? */
+	jz L(8)			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	testl $3, %ecx		/* pointer correctly aligned? */
+	jz L(29)		/* yes => start copy loop */
+	movb (%ecx), %al	/* get next byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL? */
+	jz L(8)			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	/* Now we are aligned.  */
+	jmp L(29)		/* start copy loop */
+
+	ALIGN(4)
+
+L(28):	movl %eax, 12(%ecx,%edx)/* store word at destination */
+	addl $16, %ecx		/* adjust pointer for full round */
+
+L(29):	movl (%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(9)		/* highest byte is NUL => stop copying */
+	xorl %eax, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(9)		/* one byte is NUL => stop copying */
+	movl %eax, (%ecx,%edx)	/* store word to destination */
+
+	movl 4(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(91)		/* highest byte is NUL => stop copying */
+	xorl %eax, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(91)		/* one byte is NUL => stop copying */
+	movl %eax, 4(%ecx,%edx)	/* store word to destination */
+
+	movl 8(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(92)		/* highest byte is NUL => stop copying */
+	xorl %eax, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(92)		/* one byte is NUL => stop copying */
+	movl %eax, 8(%ecx,%edx)	/* store word to destination */
+
+	movl 12(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(93)		/* highest byte is NUL => stop copying */
+	xorl %eax, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(28)		/* no byte is NUL => carry on copying */
+
+L(93):	addl $4, %ecx		/* adjust pointer */
+L(92):	addl $4, %ecx
+L(91):	addl $4, %ecx
+
+L(9):	movb %al, (%ecx,%edx)	/* store first byte of last word */
+	orb %al, %al		/* is it NUL? */
+	jz L(8)			/* yes => return */
+
+	movb %ah, 1(%ecx,%edx)	/* store second byte of last word */
+	orb %ah, %ah		/* is it NUL? */
+	jz L(8)			/* yes => return */
+
+	shrl $16, %eax		/* make upper bytes accessible */
+	movb %al, 2(%ecx,%edx)	/* store third byte of last word */
+	orb %al, %al		/* is it NUL? */
+	jz L(8)			/* yes => return */
+
+	movb %ah, 3(%ecx,%edx)	/* store fourth byte of last word */
+
+L(8):	movl DEST(%esp), %eax	/* start address of destination is result */
+	popl %edi		/* restore saved register */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (strcat)
+libc_hidden_builtin_def (strcat)
diff --git a/REORG.TODO/sysdeps/i386/strchr.S b/REORG.TODO/sysdeps/i386/strchr.S
new file mode 100644
index 0000000000..6075e77882
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strchr.S
@@ -0,0 +1,290 @@
+/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4		/* return address + space for 1 saved reg */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (strchr)
+
+	pushl %edi		/* Save callee-saved registers used here.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+	movl STR(%esp), %eax
+	movl CHR(%esp), %edx
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* low half is now c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* now it is c|c|0|0 */
+	movw %cx, %dx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  This has two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bit in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4).  */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	/* Now we have reached alignment.  */
+	jmp L(11)		/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.  If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN(4)
+
+L(1):	addl $16, %eax		/* adjust pointer for whole round */
+
+L(11):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.  */
+	jnc L(7)		/* highest byte is C => locate match */
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L(7)		/* found it => return pointer */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(2)		/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(2)		/* found NUL => return NULL */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L(71)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(71)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(2)		/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(2)		/* found NUL => return NULL */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L(72)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(72)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(2)		/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(2)		/* found NUL => return NULL */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L(73)		/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(73)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(2)		/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(1)			/* no NUL found => restart loop */
+
+L(2):	/* Return NULL.  */
+	xorl %eax, %eax
+	popl %edi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+
+	cfi_adjust_cfa_offset (4)	/* code below is entered with %edi still pushed */
+	cfi_rel_offset (edi, 0)
+L(73):	addl $4, %eax		/* adjust pointer */
+L(72):	addl $4, %eax
+L(71):	addl $4, %eax
+
+	/* We now scan for the byte in which the character was matched.
+	   But we have to take care of the case that a NUL char is
+	   found before this in the dword.  Note that we XORed %ecx
+	   with the byte we're looking for, therefore the tests below look
+	   reversed.  */
+
+L(7):	testb %cl, %cl		/* is first byte C? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %cl		/* is first byte NUL? */
+	je L(2)			/* yes => return NULL */
+	incl %eax		/* it's not in the first byte */
+
+	testb %ch, %ch		/* is second byte C? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %ch		/* is second byte NUL? */
+	je L(2)			/* yes => return NULL */
+	incl %eax		/* it's not in the second byte */
+
+	shrl $16, %ecx		/* make upper byte accessible */
+	testb %cl, %cl		/* is third byte C? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %cl		/* is third byte NUL? */
+	je L(2)			/* yes => return NULL */
+
+	/* It must be in the fourth byte and it cannot be NUL.  */
+	incl %eax
+
+L(6):
+	popl %edi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (strchr)
+
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
diff --git a/REORG.TODO/sysdeps/i386/strchrnul.S b/REORG.TODO/sysdeps/i386/strchrnul.S
new file mode 100644
index 0000000000..800b872c74
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strchrnul.S
@@ -0,0 +1,278 @@
+/* strchrnul (str, chr) -- Return pointer to first occurrence of CHR in STR
+   or the final NUL byte.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.org>
+   Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* return address + space for 1 saved reg */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (__strchrnul)
+
+	pushl %edi		/* Save callee-saved registers used here.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+
+	movl STR(%esp), %eax
+	movl CHR(%esp), %edx
+
+	/* At the moment %edx contains CHR.  What we need for the
+	   algorithm is CHR in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* low half is now c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* now it is c|c|0|0 */
+	movw %cx, %dx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  This has two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bit in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4).  */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? */
+	jz L(11)		/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L(6)			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* increment pointer */
+
+	/* Now we have reached alignment.  */
+	jmp L(11)		/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.  If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for CHR, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is CHR.  This turns each byte that is CHR
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN(4)
+
+L(1):	addl $16, %eax		/* adjust pointer for whole round */
+
+L(11):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* CHR */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.  */
+	jnc L(7)		/* highest byte is CHR => locate match */
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is CHR we don't get 0 in %edi.  */
+	jnz L(7)		/* found it => return pointer */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is NUL => return pointer to NUL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(7)		/* found NUL => return pointer to NUL */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* CHR */
+	jnc L(71)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(71)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(71)		/* highest byte is NUL => return pointer to NUL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(71)		/* found NUL => return pointer to NUL */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* CHR */
+	jnc L(72)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(72)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(72)		/* highest byte is NUL => return pointer to NUL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(72)		/* found NUL => return pointer to NUL */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* CHR */
+	jnc L(73)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(73)		/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(73)		/* highest byte is NUL => return pointer to NUL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(1)			/* no NUL found => restart loop */
+
+L(73):	addl $4, %eax		/* adjust pointer */
+L(72):	addl $4, %eax
+L(71):	addl $4, %eax
+
+	/* We now scan for the first byte that is CHR or NUL.  %ecx holds
+	   either word^mask or the restored word, so the two tests below may
+	   swap roles; both outcomes return the same pointer.  */
+
+L(7):	testb %cl, %cl		/* is first byte CHR? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %cl		/* is first byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* it's not in the first byte */
+
+	testb %ch, %ch		/* is second byte CHR? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %ch		/* is second byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+	incl %eax		/* it's not in the second byte */
+
+	shrl $16, %ecx		/* make upper byte accessible */
+	testb %cl, %cl		/* is third byte CHR? */
+	jz L(6)			/* yes => return pointer */
+	cmpb %dl, %cl		/* is third byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+
+	/* It must be the fourth byte, which is either CHR or NUL.  */
+	incl %eax
+
+L(6):	popl %edi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__strchrnul)
+
+weak_alias (__strchrnul, strchrnul)
diff --git a/REORG.TODO/sysdeps/i386/strcspn.S b/REORG.TODO/sysdeps/i386/strcspn.S
new file mode 100644
index 0000000000..c852a3b1e5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strcspn.S
@@ -0,0 +1,240 @@
+/* strcspn (str, ss) -- Return the length of the initial segment of STR
+			which contains no characters from SS.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS		/* stack offset of the string argument */
+#define STOP	STR+4		/* stack offset of the stopset argument */
+
+	.text
+ENTRY (strcspn)
+
+	movl STR(%esp), %edx	/* %edx = string to scan */
+	movl STOP(%esp), %eax	/* %eax = stopset string */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* 64 pushes * 4 bytes = 256 byte table of 0 */
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* These immediate values make the label 2 */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* get a better performance of the loop.  */
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instructions only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%cl" to "testb %cl,%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L(2):	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* table[c] = c marks c as a stop char */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* table[c] = c marks c as a stop char */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* table[c] = c marks c as a stop char */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* stored before the test; NUL only rewrites table[0] = 0 */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L(2)		/* no => process next dword from stopset */
+
+L(1):	leal -4(%edx), %eax	/* %eax = STR - 4; L(3) adds the 4 back */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the character is in the stopset we store its
+	   value in the table.  But the value of NUL is NUL so the loop
+	   terminates for NUL in every case.  */
+
+L(3):	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(4)			/* yes => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(5)			/* yes => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(6)			/* yes => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	jne L(3)		/* no => next loop round */
+
+	incl %eax		/* hit in 4th byte: +3 by falling through */
+L(6):	incl %eax		/* hit in 3rd byte: +2 */
+L(5):	incl %eax		/* hit in 2nd byte: +1 */
+
+L(4):	addl $256, %esp		/* remove stopset table */
+	cfi_adjust_cfa_offset (-256)
+	subl %edx, %eax		/* we have to return the number of valid
+				   characters, so compute distance to first
+				   non-valid character */
+	ret
+END (strcspn)
+libc_hidden_builtin_def (strcspn)
diff --git a/REORG.TODO/sysdeps/i386/string-inlines.c b/REORG.TODO/sysdeps/i386/string-inlines.c
new file mode 100644
index 0000000000..d023bc3aa3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/string-inlines.c
@@ -0,0 +1,47 @@
+/* Copyright (C) 1999-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Compile these under *_internal names to avoid PLT entries on x86.  */
+#define __memcpy_g __memcpy_g_internal
+#define __strchr_g __strchr_g_internal
+#include <string/string-inlines.c>
+/* Out-of-line __memcpy_c: copy N bytes from S to D by calling memcpy.  */
+void *
+(__memcpy_c) (void *d, const void *s, size_t n)
+{
+  return memcpy (d, s, n);
+}
+/* Fill N bytes at S with the low byte of PATTERN; also named __memset_cg.  */
+void *
+__memset_cc (void *s, unsigned long int pattern, size_t n)
+{
+  return memset (s, pattern & 0xff, n);
+}
+strong_alias (__memset_cc, __memset_cg)
+/* Fill N bytes at S with the byte C by calling memset.  */
+void *
+__memset_gg (void *s, char c, size_t n)
+{
+  return memset (s, c, n);
+}
+/* If __memcpy_c is a macro, alias the *_internal names to the real ones.  */
+#ifdef __memcpy_c
+# undef __memcpy_g
+strong_alias (__memcpy_g_internal, __memcpy_g)
+# undef __strchr_g
+strong_alias (__strchr_g_internal, __strchr_g)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/strlen.S b/REORG.TODO/sysdeps/i386/strlen.S
new file mode 100644
index 0000000000..192fadf20a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strlen.S
@@ -0,0 +1,132 @@
+/* strlen(str) -- determine the length of the string STR.
+   Optimized for Intel 80x86, x>=4.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS		/* stack offset of the string argument */
+
+	.text
+ENTRY (strlen)
+
+	movl STR(%esp), %ecx	/* %ecx = string pointer */
+	movl %ecx, %eax		/* duplicate it */
+
+	andl $3, %ecx		/* mask alignment bits; %ch is 0 from here on */
+	jz L(1)			/* aligned => start loop */
+	cmpb %ch, (%eax)	/* is byte NUL?  (%ch == 0) */
+	je L(2)			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	xorl $3, %ecx		/* was alignment = 3? */
+	jz L(1)			/* yes => now it is aligned and start loop */
+	cmpb %ch, (%eax)	/* is byte NUL?  (%ch == 0) */
+	je L(2)			/* yes => return */
+	addl $1, %eax		/* increment pointer */
+
+	subl $1, %ecx		/* was alignment = 2? */
+	jz L(1)			/* yes => now it is aligned and start loop */
+	cmpb %ch, (%eax)	/* is byte NUL?  (%ch == 0) */
+	je L(2)			/* yes => return */
+
+/* Don't change the above `addl $1,%eax' and `subl $1, %ecx' into `incl %eax'
+   and `decl %ecx' resp.  The additional two bytes per instruction make the
+   label 4 to be aligned on a 16 byte boundary with nops.
+
+   The following `subl $15, %eax' is part of this trick, too.  Together with
+   the next instruction (`addl $16, %eax') it is in fact an `incl %eax', just
+   as expected from the algorithm.  But doing so has the advantage that
+   no jump to label 1 is necessary and so the pipeline is not flushed.  */
+
+	subl $15, %eax		/* effectively +1 */
+
+
+L(4):	addl $16, %eax		/* adjust pointer for full loop */
+
+L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(3)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(3)		/* found NUL => return pointer */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(5)		/* found NUL => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(6)		/* found NUL => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(4)			/* no NUL found => continue loop */
+
+L(7):	addl $4, %eax		/* NUL in 4th word: +12 by falling through */
+L(6):	addl $4, %eax		/* NUL in 3rd word: +8 */
+L(5):	addl $4, %eax		/* NUL in 2nd word: +4 */
+
+L(3):	testb %cl, %cl		/* is first byte NUL? */
+	jz L(2)			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	testb %ch, %ch		/* is second byte NUL? */
+	jz L(2)			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	testl $0xff0000, %ecx	/* is third byte NUL? */
+	jz L(2)			/* yes => return */
+	incl %eax		/* no => the NUL is the fourth byte */
+
+L(2):	subl STR(%esp), %eax	/* compute difference to string start */
+
+	ret
+END (strlen)
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/i386/strlen.c b/REORG.TODO/sysdeps/i386/strlen.c
new file mode 100644
index 0000000000..0b69957392
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strlen.c
@@ -0,0 +1,35 @@
+/* Determine the length of a string.  For Intel 80x86, x>=3.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+/* Return the number of bytes in STR before its terminating NUL.  */
+size_t
+strlen (const char *str)
+{
+  int cnt;	/* %ecx after the scan: -1 minus the bytes examined.  */
+  /* scasb advances %edi and reads *STR: declare both to the compiler.  */
+  asm("cld\n"			/* Search forward.  */
+      /* Some old versions of gas need `repne' instead of `repnz'.  */
+      "repnz\n"			/* Look for a zero byte.  */
+      "scasb" : "=c" (cnt), "+D" (str)
+      : "m" (*(const char (*)[]) str), "0" (-1), "a" (0));
+  return -2 - cnt;	/* len + 1 bytes were examined, NUL included.  */
+}
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/i386/strpbrk.S b/REORG.TODO/sysdeps/i386/strpbrk.S
new file mode 100644
index 0000000000..1109b233da
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strpbrk.S
@@ -0,0 +1,243 @@
+/* strpbrk (str, ss) -- Return a pointer to the first character of STR
+			which is also in SS, or NULL if there is none.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define RTN	PARMS
+#define STR	RTN		/* stack offset of the string argument */
+#define STOP	STR+4		/* stack offset of the stopset argument */
+
+	.text
+ENTRY (strpbrk)
+
+	movl STR(%esp), %edx	/* %edx = string to scan */
+	movl STOP(%esp), %eax	/* %eax = stopset string */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* 64 pushes * 4 bytes = 256 byte table of 0 */
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* These immediate values make the label 2 */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* get a better performance of the loop.  */
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instructions only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%cl" to "testb %cl,%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L(2):	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* table[c] = c marks c as a stop char */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* table[c] = c marks c as a stop char */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* table[c] = c marks c as a stop char */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* stored before the test; NUL only rewrites table[0] = 0 */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L(2)		/* no => process next dword from stopset */
+
+L(1):	leal -4(%edx), %eax	/* %eax = STR - 4; L(3) adds the 4 back */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the character is in the stopset we store its
+	   value in the table.  But the value of NUL is NUL so the loop
+	   terminates for NUL in every case.  */
+
+L(3):	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(4)			/* yes => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(5)			/* yes => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L(6)			/* yes => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	jne L(3)			/* no => next loop round */
+
+	incl %eax		/* hit in 4th byte: +3 by falling through */
+L(6):	incl %eax		/* hit in 3rd byte: +2 */
+L(5):	incl %eax		/* hit in 2nd byte: +1 */
+
+L(4):	addl $256, %esp		/* remove stopset table */
+	cfi_adjust_cfa_offset (-256)
+
+	orb %cl, %cl		/* was last character NUL? */
+	jnz L(7)		/* no => return pointer */
+	xorl %eax, %eax		/* yes => nothing found, return NULL */
+
+L(7):	ret
+END (strpbrk)
+libc_hidden_builtin_def (strpbrk)
diff --git a/REORG.TODO/sysdeps/i386/strrchr.S b/REORG.TODO/sysdeps/i386/strrchr.S
new file mode 100644
index 0000000000..95b304dc0b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strrchr.S
@@ -0,0 +1,334 @@
+/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+8	/* space for 2 saved regs */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (strrchr)
+
+	pushl %edi		/* Save callee-safe registers used here.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	xorl %eax, %eax
+	movl STR(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl CHR(%esp), %ecx
+
+	/* At the moment %ecx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require an
+	   prefix byte (and one more cycle).  */
+	movb %cl, %ch		/* now it is 0|0|c|c */
+	movl %ecx, %edx
+	shll $16, %ecx		/* now it is c|c|0|0 */
+	movw %dx, %cx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  This has two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bit in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4.  */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L(19)		/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L(11)			/* target found => return */
+	movl %esi, %eax		/* remember pointer as possible result */
+L(11):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %esi		/* increment pointer */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L(19)		/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L(12)			/* target found => return */
+	movl %esi, %eax		/* remember pointer as result */
+L(12):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %esi		/* increment pointer */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L(19)		/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L(13)			/* target found => return */
+	movl %esi, %eax		/* remember pointer as result */
+L(13):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return NULL */
+	incl %esi		/* increment pointer */
+
+	/* No we have reached alignment.  */
+	jmp L(19)		/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	/* Jump to here when the character is detected.  We chose this
+	   way around because the character one is looking for is not
+	   as frequent as the rest and taking a conditional jump is more
+	   expensive than ignoring it.
+
+	   Some more words to the code below: it might not be obvious why
+	   we decrement the source pointer here.  In the loop the pointer
+	   is not pre-incremented and so it still points before the word
+	   we are looking at.  But you should take a look at the instruction
+	   which gets executed before we get into the loop: `addl $16, %esi'.
+	   This makes the following subs into adds.  */
+
+	/* These fill bytes make the main loop be correctly aligned.
+	   We cannot use align because it is not the following instruction
+	   which should be aligned.  */
+	.byte 0, 0
+#ifndef	PROF
+	/* Profiling adds some code and so changes the alignment.  */
+	.byte 0
+#endif
+
+L(4):	subl $4, %esi		/* adjust pointer */
+L(41):	subl $4, %esi
+L(42):	subl $4, %esi
+L(43):	testl $0xff000000, %edx	/* is highest byte == C? */
+	jnz L(33)		/* no => try other bytes */
+	leal 15(%esi), %eax	/* store address as result */
+	jmp L(1)		/* and start loop again */
+
+L(3):	subl $4, %esi		/* adjust pointer */
+L(31):	subl $4, %esi
+L(32):	subl $4, %esi
+L(33):	testl $0xff0000, %edx	/* is C in third byte? */
+	jnz L(51)		/* no => try other bytes */
+	leal 14(%esi), %eax	/* store address as result */
+	jmp L(1)		/* and start loop again */
+
+L(51):
+	/* At this point we know that the byte is in one of the lower bytes.
+	   We make a guess and correct it if necessary.  This reduces the
+	   number of necessary jumps.  */
+	leal 12(%esi), %eax	/* guess address of lowest byte as result */
+	testb %dh, %dh		/* is guess correct? */
+	jnz L(1)		/* yes => start loop */
+	leal 13(%esi), %eax	/* correct guess to second byte */
+
+L(1):	addl $16, %esi		/* increment pointer for full round */
+
+L(19):	movl (%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter then last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.	*/
+
+	jnc L(20)			/* found NUL => check last word */
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %edx, %edi		/* (word+magic)^word */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L(20)			/* found NUL => check last word */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(4)		/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(3)		/* C is detected in the word => examine it */
+
+	movl 4(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(21)		/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(21)		/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(41)		/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(31)		/* C is detected in the word => examine it */
+
+	movl 8(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(22)		/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(22)		/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(42)		/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(32)		/* C is detected in the word => examine it */
+
+	movl 12(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(23)		/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L(23)		/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(43)		/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L(1)			/* C is not detected => restart loop */
+	jmp L(33)		/* examine word */
+
+L(23):	addl $4, %esi		/* adjust pointer */
+L(22):	addl $4, %esi
+L(21):	addl $4, %esi
+
+	/* What remains to do is to test which byte the NUL char is and
+	   whether the searched character appears in one of the bytes
+	   before.  A special case is that the searched byte maybe NUL.
+	   In this case a pointer to the terminating NUL char has to be
+	   returned.  */
+
+L(20):	cmpb %cl, %dl		/* is first byte == C? */
+	jne L(24)		/* no => skip */
+	movl %esi, %eax		/* store address as result */
+L(24):	testb %dl, %dl		/* is first byte == NUL? */
+	jz L(2)			/* yes => return */
+
+	cmpb %cl, %dh		/* is second byte == C? */
+	jne L(25)		/* no => skip */
+	leal 1(%esi), %eax	/* store address as result */
+L(25):	testb %dh, %dh		/* is second byte == NUL? */
+	jz L(2)			/* yes => return */
+
+	shrl $16,%edx		/* make upper bytes accessible */
+	cmpb %cl, %dl		/* is third byte == C */
+	jne L(26)		/* no => skip */
+	leal 2(%esi), %eax	/* store address as result */
+L(26):	testb %dl, %dl		/* is third byte == NUL */
+	jz L(2)			/* yes => return */
+
+	cmpb %cl, %dh		/* is fourth byte == C */
+	jne L(2)		/* no => skip */
+	leal 3(%esi), %eax	/* store address as result */
+
+L(2):	popl %esi		/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (strrchr)
+
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/REORG.TODO/sysdeps/i386/strspn.S b/REORG.TODO/sysdeps/i386/strspn.S
new file mode 100644
index 0000000000..d433eb6af5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strspn.S
@@ -0,0 +1,240 @@
+/* strspn (str, ss) -- Return the length of the initial segment of STR
+			which contains only characters from SS.
+   For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4		/* no space for saved regs */
+#define STR	PARMS
+#define SKIP	STR+4
+
+	.text
+ENTRY (strspn)
+
+	movl STR(%esp), %edx	/* %edx = string to scan */
+	movl SKIP(%esp), %eax	/* %eax = set of accepted characters */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the skip characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256 bytes long block filled with 0 */
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl %ecx
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* These immediate values make the label 2 */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* get a better performance of the loop.  */
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+	pushl $0
+	cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instructions only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L(2):	movb (%eax), %cl	/* get byte from skipset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in skipset table */
+
+	movb 1(%eax), %cl	/* get byte from skipset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in skipset table */
+
+	movb 2(%eax), %cl	/* get byte from skipset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L(1)			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in skipset table */
+
+	movb 3(%eax), %cl	/* get byte from skipset */
+	addl $4, %eax		/* increment skipset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in skipset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L(2)		/* no => process next dword from skipset */
+
+L(1):	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character not in the skipset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the character is in the skipset we store its
+	   value in the table.  For NUL the tested value is NUL itself, so the
+	   loop terminates for NUL in every case.  */
+
+L(3):	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L(4)			/* no => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L(5)			/* no => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L(6)			/* no => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jnz L(3)		/* yes => start loop again */
+
+	incl %eax		/* adjust pointer */
+L(6):	incl %eax
+L(5):	incl %eax
+
+L(4):	addl $256, %esp		/* remove skipset table */
+	cfi_adjust_cfa_offset (-256)
+	subl %edx, %eax		/* we have to return the number of valid
+				   characters, so compute distance to first
+				   non-valid character */
+	ret
+END (strspn)
+libc_hidden_builtin_def (strspn)
diff --git a/REORG.TODO/sysdeps/i386/sub_n.S b/REORG.TODO/sysdeps/i386/sub_n.S
new file mode 100644
index 0000000000..3649da29e7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sub_n.S
@@ -0,0 +1,111 @@
+/* i80386 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+   store difference in a third limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+8		/* space for 2 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define S2	S1+4
+#define SIZE	S2+4
+
+	.text
+ENTRY (__mpn_sub_n)
+
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	movl	RES(%esp),%edi		/* %edi = result vector */
+	cfi_rel_offset (edi, 4)
+	movl	S1(%esp),%esi		/* %esi = first source (minuend) */
+	cfi_rel_offset (esi, 0)
+	movl	S2(%esp),%edx		/* %edx = second source (subtrahend) */
+	movl	SIZE(%esp),%ecx		/* %ecx = number of limbs */
+	movl	%ecx,%eax
+	shrl	$3,%ecx			/* compute count for unrolled loop */
+	negl	%eax
+	andl	$7,%eax			/* get index where to start loop */
+	jz	L(oop)			/* necessary special case for 0 */
+	incl	%ecx			/* adjust loop count */
+	shll	$2,%eax			/* adjustment for pointers... */
+	subl	%eax,%edi		/* ... since they are offset ... */
+	subl	%eax,%esi		/* ... by a constant when we ... */
+	subl	%eax,%edx		/* ... enter the loop */
+	shrl	$2,%eax			/* restore previous value */
+#ifdef PIC
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, Loop-L0-3 cannot be put into the leal */
+	call	L(0)			/* pushes the address of L(0) */
+	cfi_adjust_cfa_offset (4)
+L(0):	leal	(%eax,%eax,8),%eax	/* %eax = start index * 9 */
+	addl	(%esp),%eax		/* add the pushed address of L(0) */
+	addl	$(L(oop)-L(0)-3),%eax	/* rebase onto L(oop) - 3 */
+	addl	$4,%esp			/* drop the pushed address */
+	cfi_adjust_cfa_offset (-4)
+#else
+/* Calculate start address in loop for non-PIC.  */
+ 	leal	(L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+	jmp	*%eax			/* jump into loop */
+	ALIGN (3)
+L(oop):	movl	(%esi),%eax
+	sbbl	(%edx),%eax		/* subtract limb and incoming borrow */
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	sbbl	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	sbbl	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	sbbl	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	sbbl	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	sbbl	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	sbbl	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	sbbl	28(%edx),%eax
+	movl	%eax,28(%edi)
+	leal	32(%edi),%edi		/* advance by 8 limbs; leal keeps CF */
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx			/* decl leaves CF (the borrow) intact */
+	jnz	L(oop)
+
+	sbbl	%eax,%eax		/* %eax = 0 or -1 from the final borrow */
+	negl	%eax			/* return the borrow as 0 or 1 */
+
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl %edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_sub_n)
diff --git a/REORG.TODO/sysdeps/i386/submul_1.S b/REORG.TODO/sysdeps/i386/submul_1.S
new file mode 100644
index 0000000000..c765e8dd79
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/submul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+   the result from a second limb vector.
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS	4+16	/* space for 4 saved regs */
+#define RES	PARMS
+#define S1	RES+4
+#define SIZE	S1+4
+#define S2LIMB	SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define sizeP ecx
+#define s2_limb ebx
+
+	.text
+ENTRY (__mpn_submul_1)
+
+	pushl	%edi
+	cfi_adjust_cfa_offset (4)
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebp
+	cfi_adjust_cfa_offset (4)
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 12)
+	cfi_rel_offset (esi, 8)
+	cfi_rel_offset (ebp, 4)
+	cfi_rel_offset (ebx, 0)
+
+	movl	RES(%esp), %res_ptr
+	movl	S1(%esp), %s1_ptr
+	movl	SIZE(%esp), %sizeP
+	movl	S2LIMB(%esp), %s2_limb
+	leal	(%res_ptr,%sizeP,4), %res_ptr	/* point just past the last limb */
+	leal	(%s1_ptr,%sizeP,4), %s1_ptr	/* likewise for the source */
+	negl	%sizeP			/* index runs from -size up to 0 */
+	xorl	%ebp, %ebp		/* carry limb starts at 0 */
+	ALIGN (3)
+L(oop):
+	movl	(%s1_ptr,%sizeP,4), %eax	/* fetch source limb */
+	mull	%s2_limb		/* %edx:%eax = limb * s2_limb */
+	addl	%ebp, %eax		/* add carry limb from previous round */
+	adcl	$0, %edx		/* ... propagating into the high half */
+	subl	%eax, (%res_ptr,%sizeP,4)	/* subtract low half from result */
+	adcl	$0, %edx		/* fold the borrow into the high half */
+	movl	%edx, %ebp		/* high half is the next carry limb */
+
+	incl	%sizeP
+	jnz	L(oop)			/* until the index reaches 0 */
+	movl	%ebp, %eax		/* return the final carry limb */
+
+	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	popl	%ebp
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebp)
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+	popl	%edi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__mpn_submul_1)
diff --git a/REORG.TODO/sysdeps/i386/symbol-hacks.h b/REORG.TODO/sysdeps/i386/symbol-hacks.h
new file mode 100644
index 0000000000..36a13c83f7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/symbol-hacks.h
@@ -0,0 +1,21 @@
+/* Hacks needed for symbol manipulation.  i386 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/wordsize-32/divdi3-symbol-hacks.h>
+
+#include_next "symbol-hacks.h"
diff --git a/REORG.TODO/sysdeps/i386/sys/ucontext.h b/REORG.TODO/sysdeps/i386/sys/ucontext.h
new file mode 100644
index 0000000000..fb5df11965
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sys/ucontext.h
@@ -0,0 +1,139 @@
+/* Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* System V/i386 ABI compliant context switching support.  */
+
+#ifndef _SYS_UCONTEXT_H
+#define _SYS_UCONTEXT_H	1
+
+#include <features.h>
+
+#include <bits/types/sigset_t.h>
+#include <bits/sigcontext.h>
+#include <bits/types/stack_t.h>
+
+
+/* Type for general register.  */
+typedef int greg_t;
+
+/* Number of general registers.  */
+#define __NGREG	19
+#ifdef __USE_MISC
+# define NGREG	__NGREG
+#endif
+
+/* Container for all general registers.  */
+typedef greg_t gregset_t[__NGREG];
+
+#ifdef __USE_MISC
+/* Number of each register in the `gregset_t' array.  */
+enum
+{
+  REG_GS = 0,
+# define REG_GS	REG_GS
+  REG_FS,
+# define REG_FS	REG_FS
+  REG_ES,
+# define REG_ES	REG_ES
+  REG_DS,
+# define REG_DS	REG_DS
+  REG_EDI,
+# define REG_EDI	REG_EDI
+  REG_ESI,
+# define REG_ESI	REG_ESI
+  REG_EBP,
+# define REG_EBP	REG_EBP
+  REG_ESP,
+# define REG_ESP	REG_ESP
+  REG_EBX,
+# define REG_EBX	REG_EBX
+  REG_EDX,
+# define REG_EDX	REG_EDX
+  REG_ECX,
+# define REG_ECX	REG_ECX
+  REG_EAX,
+# define REG_EAX	REG_EAX
+  REG_TRAPNO,
+# define REG_TRAPNO	REG_TRAPNO
+  REG_ERR,
+# define REG_ERR	REG_ERR
+  REG_EIP,
+# define REG_EIP	REG_EIP
+  REG_CS,
+# define REG_CS	REG_CS
+  REG_EFL,
+# define REG_EFL	REG_EFL
+  REG_UESP,
+# define REG_UESP	REG_UESP
+  REG_SS
+# define REG_SS	REG_SS
+};
+#endif
+
+#ifdef __USE_MISC
+# define __ctx(fld) fld
+# define __ctxt(tag) tag
+#else
+# define __ctx(fld) __ ## fld
+# define __ctxt(tag) /* Empty.  */
+#endif
+
+/* Structure to describe FPU registers.  */
+typedef struct fpregset
+  {
+    union
+      {
+	struct __ctxt(fpchip_state)
+	  {
+	    int __ctx(state)[27];
+	    int __ctx(status);
+	  } __ctx(fpchip_state);
+
+	struct __ctxt(fp_emul_space)
+	  {
+	    char __ctx(fp_emul)[246];
+	    char __ctx(fp_epad)[2];
+	  } __ctx(fp_emul_space);
+
+	int __ctx(f_fpregs)[62];
+      } __ctx(fp_reg_set);
+
+    long int __ctx(f_wregs)[33];
+  } fpregset_t;
+
+/* Context to describe whole processor state.  */
+typedef struct
+  {
+    gregset_t __ctx(gregs);
+    fpregset_t __ctx(fpregs);
+  } mcontext_t;
+
+#undef __ctx
+#undef __ctxt
+
+/* Userlevel context.  */
+typedef struct ucontext
+  {
+    unsigned long int uc_flags;
+    struct ucontext *uc_link;
+    sigset_t uc_sigmask;
+    stack_t uc_stack;
+    mcontext_t uc_mcontext;
+    long int uc_filler[5];
+  } ucontext_t;
+
+#endif /* sys/ucontext.h */
diff --git a/REORG.TODO/sysdeps/i386/sysdep.h b/REORG.TODO/sysdeps/i386/sysdep.h
new file mode 100644
index 0000000000..d2b0860b99
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sysdep.h
@@ -0,0 +1,159 @@
+/* Assembler macros for i386.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/sysdep.h>
+
+#include <features.h> /* For __GNUC_PREREQ.  */
+
+/* It is desirable that the names of PIC thunks match those used by
+   GCC so that multiple copies are eliminated by the linker.  Because
+   GCC 4.6 and earlier use __i686 in the names, it is necessary to
+   override that predefined macro.  */
+#if defined __i686 && defined __ASSEMBLER__
+#undef __i686
+#define __i686 __i686
+#endif
+
+#ifdef	__ASSEMBLER__
+# define GET_PC_THUNK(reg) __x86.get_pc_thunk.reg
+#else
+# define GET_PC_THUNK_STR(reg) "__x86.get_pc_thunk." #reg
+#endif
+
+#ifdef	__ASSEMBLER__
+
+/* Syntactic details of assembler.  */
+
+/* ELF uses byte-counts for .align, most others use log2 of count of bytes.  */
+#define ALIGNARG(log2) 1<<log2
+#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
+
+
+/* Define an entry point visible from C.
+
+   There is currently a bug in gdb which prevents us from specifying
+   incomplete stabs information.  Fake some entries here which specify
+   the current source file.  */
+#define	ENTRY(name)							      \
+  .globl C_SYMBOL_NAME(name);						      \
+  .type C_SYMBOL_NAME(name),@function;					      \
+  .align ALIGNARG(4);							      \
+  C_LABEL(name)								      \
+  cfi_startproc;							      \
+  CALL_MCOUNT
+
+#undef	END
+#define END(name)							      \
+  cfi_endproc;								      \
+  ASM_SIZE_DIRECTIVE(name)
+
+#define ENTRY_CHK(name) ENTRY (name)
+#define END_CHK(name) END (name)
+
+/* If compiled for profiling, call `mcount' at the start of each function.  */
+#ifdef	PROF
+/* The mcount code relies on a normal frame pointer being on the stack
+   to locate our caller, so push one just for its benefit.  */
+#define CALL_MCOUNT \
+  pushl %ebp; cfi_adjust_cfa_offset (4); movl %esp, %ebp; \
+  cfi_def_cfa_register (ebp); call JUMPTARGET(mcount); \
+  popl %ebp; cfi_def_cfa (esp, 4);
+#else
+#define CALL_MCOUNT		/* Do nothing.  */
+#endif
+
+/* Since C identifiers are not normally prefixed with an underscore
+   on this system, the asm identifier `syscall_error' intrudes on the
+   C name space.  Make sure we use an innocuous name.  */
+#define	syscall_error	__syscall_error
+#define mcount		_mcount
+
+#define	PSEUDO(name, syscall_name, args)				      \
+  .globl syscall_error;							      \
+lose: SYSCALL_PIC_SETUP							      \
+  jmp JUMPTARGET(syscall_error);					      \
+  ENTRY (name)								      \
+  DO_CALL (syscall_name, args);						      \
+  jb lose
+
+#undef	PSEUDO_END
+#define	PSEUDO_END(name)						      \
+  END (name)
+
+# define SETUP_PIC_REG(reg) \
+  .ifndef GET_PC_THUNK(reg);						      \
+  .section .gnu.linkonce.t.GET_PC_THUNK(reg),"ax",@progbits;		      \
+  .globl GET_PC_THUNK(reg);						      \
+  .hidden GET_PC_THUNK(reg);						      \
+  .p2align 4;								      \
+  .type GET_PC_THUNK(reg),@function;					      \
+GET_PC_THUNK(reg):							      \
+  movl (%esp), %e##reg;							      \
+  ret;									      \
+  .size GET_PC_THUNK(reg), . - GET_PC_THUNK(reg);			      \
+  .previous;								      \
+  .endif;								      \
+  call GET_PC_THUNK(reg)
+
+# define LOAD_PIC_REG(reg) \
+  SETUP_PIC_REG(reg); addl $_GLOBAL_OFFSET_TABLE_, %e##reg
+
+#undef JUMPTARGET
+#ifdef PIC
+#define JUMPTARGET(name)	name##@PLT
+#define SYSCALL_PIC_SETUP \
+    pushl %ebx;								      \
+    cfi_adjust_cfa_offset (4);						      \
+    call 0f;								      \
+0:  popl %ebx;								      \
+    cfi_adjust_cfa_offset (-4);						      \
+    addl $_GLOBAL_OFFSET_TABLE_+[.-0b], %ebx;
+
+#else
+#define JUMPTARGET(name)	name
+#define SYSCALL_PIC_SETUP	/* Nothing.  */
+#endif
+
+/* Local label name for asm code. */
+#ifndef L
+#define L(name)		.L##name
+#endif
+
+#define atom_text_section .section ".text.atom", "ax"
+
+#else /* __ASSEMBLER__ */
+
+# define SETUP_PIC_REG_STR(reg)						\
+  ".ifndef " GET_PC_THUNK_STR (reg) "\n"				\
+  ".section .gnu.linkonce.t." GET_PC_THUNK_STR (reg) ",\"ax\",@progbits\n" \
+  ".globl " GET_PC_THUNK_STR (reg) "\n"					\
+  ".hidden " GET_PC_THUNK_STR (reg) "\n"				\
+  ".p2align 4\n"							\
+  ".type " GET_PC_THUNK_STR (reg) ",@function\n"			\
+GET_PC_THUNK_STR (reg) ":"						\
+  "movl (%%esp), %%e" #reg "\n"						\
+  "ret\n"								\
+  ".size " GET_PC_THUNK_STR (reg) ", . - " GET_PC_THUNK_STR (reg) "\n"	\
+  ".previous\n"								\
+  ".endif\n"								\
+  "call " GET_PC_THUNK_STR (reg)
+
+# define LOAD_PIC_REG_STR(reg) \
+  SETUP_PIC_REG_STR (reg) "\naddl $_GLOBAL_OFFSET_TABLE_, %%e" #reg
+
+#endif	/* __ASSEMBLER__ */
diff --git a/REORG.TODO/sysdeps/i386/tls-macros.h b/REORG.TODO/sysdeps/i386/tls-macros.h
new file mode 100644
index 0000000000..053cba05d1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tls-macros.h
@@ -0,0 +1,78 @@
+#include <features.h> /* For __GNUC_PREREQ.  */
+
+#define TLS_LE(x) \
+  ({ int *__l;								      \
+     asm ("movl %%gs:0,%0\n\t"						      \
+	  "subl $" #x "@tpoff,%0"					      \
+	  : "=r" (__l));						      \
+     __l; })
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_IE(x) \
+  ({ int *__l;								      \
+     asm ("movl %%gs:0,%0\n\t"						      \
+	  "subl " #x "@gottpoff(%%ebx),%0"				      \
+	  : "=r" (__l));						      \
+     __l; })
+#else
+# define TLS_IE(x) \
+  ({ int *__l, __b;							      \
+     asm ("call 1f\n\t"							      \
+	  ".subsection 1\n"						      \
+	  "1:\tmovl (%%esp), %%ebx\n\t"					      \
+	  "ret\n\t"							      \
+	  ".previous\n\t"						      \
+	  "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t"			      \
+	  "movl %%gs:0,%0\n\t"						      \
+	  "subl " #x "@gottpoff(%%ebx),%0"				      \
+	  : "=r" (__l), "=&b" (__b));					      \
+     __l; })
+#endif
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_LD(x) \
+  ({ int *__l, __c, __d;						      \
+     asm ("leal " #x "@tlsldm(%%ebx),%%eax\n\t"				      \
+	  "call ___tls_get_addr@plt\n\t"				      \
+	  "leal " #x "@dtpoff(%%eax), %%eax"				      \
+	  : "=a" (__l), "=&c" (__c), "=&d" (__d));			      \
+     __l; })
+#else
+# define TLS_LD(x) \
+  ({ int *__l, __b, __c, __d;						      \
+     asm ("call 1f\n\t"							      \
+	  ".subsection 1\n"						      \
+	  "1:\tmovl (%%esp), %%ebx\n\t"					      \
+	  "ret\n\t"							      \
+	  ".previous\n\t"						      \
+	  "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t"			      \
+	  "leal " #x "@tlsldm(%%ebx),%%eax\n\t"				      \
+	  "call ___tls_get_addr@plt\n\t"				      \
+	  "leal " #x "@dtpoff(%%eax), %%eax"				      \
+	  : "=a" (__l), "=&b" (__b), "=&c" (__c), "=&d" (__d));		      \
+     __l; })
+#endif
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_GD(x) \
+  ({ int *__l, __c, __d;						      \
+     asm ("leal " #x "@tlsgd(%%ebx),%%eax\n\t"				      \
+	  "call ___tls_get_addr@plt\n\t"				      \
+	  "nop"								      \
+	  : "=a" (__l), "=&c" (__c), "=&d" (__d));			      \
+     __l; })
+#else
+# define TLS_GD(x) \
+  ({ int *__l, __b, __c, __d;						      \
+     asm ("call 1f\n\t"							      \
+	  ".subsection 1\n"						      \
+	  "1:\tmovl (%%esp), %%ebx\n\t"					      \
+	  "ret\n\t"							      \
+	  ".previous\n\t"						      \
+	  "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t"			      \
+	  "leal " #x "@tlsgd(%%ebx),%%eax\n\t"				      \
+	  "call ___tls_get_addr@plt\n\t"				      \
+	  "nop"								      \
+	  : "=a" (__l), "=&b" (__b), "=&c" (__c), "=&d" (__d));		      \
+     __l; })
+#endif
diff --git a/REORG.TODO/sysdeps/i386/tlsdesc.c b/REORG.TODO/sysdeps/i386/tlsdesc.c
new file mode 100644
index 0000000000..90de2bb05e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tlsdesc.c
@@ -0,0 +1,268 @@
+/* Manage TLS descriptors.  i386 version.
+   Copyright (C) 2005-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <link.h>
+#include <ldsodefs.h>
+#include <elf/dynamic-link.h>
+#include <tls.h>
+#include <dl-tlsdesc.h>
+#include <dl-unmap-segments.h>
+#include <tlsdeschtab.h>
+
+/* The following 4 functions take an entry_check_offset argument.
+   It's computed by the caller as an offset between its entry point
+   and the call site, such that by subtracting this offset from the
+   return address that is implicitly passed to the function, we can
+   easily obtain the caller's entry point to compare with the entry
+   point given in the TLS descriptor.  If it's changed, we want to
+   return immediately.  */
+
+/* This function is used to lazily resolve TLS_DESC REL relocations
+   that reference the *ABS* segment in their own link maps.  The
+   argument is the addend originally stored there.  */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_abs_plus_addend_fixup (struct tlsdesc volatile *td,
+					   struct link_map *l,
+					   ptrdiff_t entry_check_offset)
+{
+  ptrdiff_t addend = (ptrdiff_t) td->arg;  /* read before td->arg is rewritten */
+  /* Return address minus the offset is the entry point the caller used.  */
+  if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+					  - entry_check_offset))
+    return;
+  /* Pick the resolved form; td->arg is stored before td->entry each time.  */
+#ifndef SHARED
+  CHECK_STATIC_TLS (l, l);	/* control then enters the braced block below */
+#else
+  if (!TRY_STATIC_TLS (l, l))
+    {
+      td->arg = _dl_make_tlsdesc_dynamic (l, addend);
+      td->entry = _dl_tlsdesc_dynamic;
+    }
+  else
+#endif
+    {
+      td->arg = (void*) (addend - l->l_tls_offset);
+      td->entry = _dl_tlsdesc_return;
+    }
+  /* Release threads waiting in _dl_tlsdesc_resolve_hold_fixup.  */
+  _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to lazily resolve TLS_DESC REL relocations
+   that originally had zero addends.  The argument location, that
+   originally held the addend, is used to hold a pointer to the
+   relocation, but it has to be restored before we call the function
+   that applies relocations.  */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_rel_fixup (struct tlsdesc volatile *td,
+			       struct link_map *l,
+			       ptrdiff_t entry_check_offset)
+{
+  const ElfW(Rel) *reloc = td->arg;	/* td->arg is read as the relocation */
+  /* Return address minus the offset is the entry point the caller used.  */
+  if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+					  - entry_check_offset))
+    return;
+
+  /* The code below was borrowed from _dl_fixup(),
+     except for checking for STB_LOCAL.  */
+  const ElfW(Sym) *const symtab
+    = (const void *) D_PTR (l, l_info[DT_SYMTAB]);
+  const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
+  const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
+  lookup_t result;	/* link map in which SYM ends up being defined */
+
+   /* Look up the target symbol.  If the normal lookup rules are not
+      used don't look in the global scope.  */
+  if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL
+      && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0)
+    {
+      const struct r_found_version *version = NULL;
+      /* With a DT_VERSYM table, look the symbol's version record up.  */
+      if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+	{
+	  const ElfW(Half) *vernum =
+	    (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]);
+	  ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff;
+	  version = &l->l_versions[ndx];
+	  if (version->hash == 0)	/* such a record is passed as NULL */
+	    version = NULL;
+	}
+      /* The lookup may replace SYM (it is passed by address).  */
+      result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym,
+				    l->l_scope, version, ELF_RTYPE_CLASS_PLT,
+				    DL_LOOKUP_ADD_DEPENDENCY, NULL);
+    }
+  else
+    {
+      /* We already found the symbol.  The module (and therefore its load
+	 address) is also known.  */
+      result = l;
+    }
+  /* A null SYM selects the undefweak entry with a zero argument.  */
+  if (!sym)
+    {
+      td->arg = 0;
+      td->entry = _dl_tlsdesc_undefweak;
+    }
+  else
+    {
+#  ifndef SHARED
+      CHECK_STATIC_TLS (l, result);	/* then enters the braced block below */
+#  else
+      if (!TRY_STATIC_TLS (l, result))
+	{
+	  td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value);
+	  td->entry = _dl_tlsdesc_dynamic;
+	}
+      else
+#  endif
+	{
+	  td->arg = (void*)(sym->st_value - result->l_tls_offset);
+	  td->entry = _dl_tlsdesc_return;
+	}
+    }
+  /* Release threads waiting in _dl_tlsdesc_resolve_hold_fixup.  */
+  _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to lazily resolve TLS_DESC RELA relocations.
+   The argument location is used to hold a pointer to the relocation.  */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
+				struct link_map *l,
+				ptrdiff_t entry_check_offset)
+{
+  const ElfW(Rela) *reloc = td->arg;	/* td->arg is read as the relocation */
+  /* Return address minus the offset is the entry point the caller used.  */
+  if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+					  - entry_check_offset))
+    return;
+
+  /* The code below was borrowed from _dl_fixup(),
+     except for checking for STB_LOCAL.  */
+  const ElfW(Sym) *const symtab
+    = (const void *) D_PTR (l, l_info[DT_SYMTAB]);
+  const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
+  const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
+  lookup_t result;	/* link map in which SYM ends up being defined */
+
+   /* Look up the target symbol.  If the normal lookup rules are not
+      used don't look in the global scope.  */
+  if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL
+      && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0)
+    {
+      const struct r_found_version *version = NULL;
+      /* With a DT_VERSYM table, look the symbol's version record up.  */
+      if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+	{
+	  const ElfW(Half) *vernum =
+	    (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]);
+	  ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff;
+	  version = &l->l_versions[ndx];
+	  if (version->hash == 0)	/* such a record is passed as NULL */
+	    version = NULL;
+	}
+      /* The lookup may replace SYM (it is passed by address).  */
+      result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym,
+				    l->l_scope, version, ELF_RTYPE_CLASS_PLT,
+				    DL_LOOKUP_ADD_DEPENDENCY, NULL);
+    }
+  else
+    {
+      /* We already found the symbol.  The module (and therefore its load
+	 address) is also known.  */
+      result = l;
+    }
+  /* A null SYM selects the undefweak entry; the addend becomes the arg.  */
+  if (!sym)
+    {
+      td->arg = (void*) reloc->r_addend;
+      td->entry = _dl_tlsdesc_undefweak;
+    }
+  else
+    {
+#  ifndef SHARED
+      CHECK_STATIC_TLS (l, result);	/* then enters the braced block below */
+#  else
+      if (!TRY_STATIC_TLS (l, result))
+	{
+	  td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value
+					      + reloc->r_addend);
+	  td->entry = _dl_tlsdesc_dynamic;
+	}
+      else
+#  endif
+	{
+	  td->arg = (void*) (sym->st_value - result->l_tls_offset
+			     + reloc->r_addend);
+	  td->entry = _dl_tlsdesc_return;
+	}
+    }
+  /* Release threads waiting in _dl_tlsdesc_resolve_hold_fixup.  */
+  _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to avoid busy waiting for other threads to
+   complete the lazy relocation.  Once another thread wins the race to
+   relocate a TLS descriptor, it sets the descriptor up such that this
+   function is called to wait until the resolver releases the
+   lock.  */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td,
+				struct link_map *l __attribute__((__unused__)),
+				ptrdiff_t entry_check_offset)
+{
+  /* Maybe we're lucky and can return early.  */
+  if (__builtin_return_address (0) - entry_check_offset != td->entry)
+    return;	/* td->entry is no longer the entry point we came through */
+
+  /* Locking here will stop execution until the running resolver runs
+     _dl_tlsdesc_wake_up_held_fixups(), releasing the lock.
+
+     FIXME: We'd be better off waiting on a condition variable, such
+     that we didn't have to hold the lock throughout the relocation
+     processing.  */
+  __rtld_lock_lock_recursive (GL(dl_load_lock));	/* acquired only to wait */
+  __rtld_lock_unlock_recursive (GL(dl_load_lock));	/* so drop it right away */
+}
+
+
+/* Unmap the dynamic object, but also release its TLS descriptor table
+   if there is one.  */
+
+void
+internal_function
+_dl_unmap (struct link_map *map)
+{
+  _dl_unmap_segments (map);
+  /* Shared builds only: delete the TLS descriptor table when one exists.  */
+#if defined SHARED
+  if (map->l_mach.tlsdesc_table != NULL)
+    htab_delete (map->l_mach.tlsdesc_table);
+#endif
+}
diff --git a/REORG.TODO/sysdeps/i386/tlsdesc.sym b/REORG.TODO/sysdeps/i386/tlsdesc.sym
new file mode 100644
index 0000000000..33854975d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tlsdesc.sym
@@ -0,0 +1,17 @@
+#include <stddef.h>
+#include <sysdep.h>
+#include <tls.h>
+#include <link.h>
+#include <dl-tlsdesc.h>
+
+--
+
+-- Abuse tls.h macros to derive offsets relative to the thread register.
+
+DTV_OFFSET			offsetof(struct pthread, header.dtv)
+
+TLSDESC_ARG			offsetof(struct tlsdesc, arg)
+
+TLSDESC_GEN_COUNT		offsetof(struct tlsdesc_dynamic_arg, gen_count)
+TLSDESC_MODID			offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
+TLSDESC_MODOFF			offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
diff --git a/REORG.TODO/sysdeps/i386/tst-audit.h b/REORG.TODO/sysdeps/i386/tst-audit.h
new file mode 100644
index 0000000000..87bf199c85
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit.h
@@ -0,0 +1,25 @@
+/* Definitions for testing PLT entry/exit auditing.  i386 version.
+
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define pltenter la_i86_gnu_pltenter	/* symbol the PLT-entry hook is defined as */
+#define pltexit la_i86_gnu_pltexit	/* symbol the PLT-exit hook is defined as */
+#define La_regs La_i86_regs		/* register-set type the hooks receive */
+#define La_retval La_i86_retval		/* return-value type pltexit receives */
+#define int_retval lrv_eax		/* La_retval member printed as the result */
diff --git a/REORG.TODO/sysdeps/i386/tst-audit3.c b/REORG.TODO/sysdeps/i386/tst-audit3.c
new file mode 100644
index 0000000000..b67a59d733
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit3.c
@@ -0,0 +1,37 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include "tst-audit3.h"
+
+static int
+do_test (void)
+{
+  /* Integer return path: the callee must hand back exactly 30.  */
+  const long long int_result = audit1_test (1, 2, 3);
+  if (int_result != 30)
+    abort ();
+  /* Floating-point return path: the same value is expected.  */
+  const float fp_result = audit2_test (1, 2, 3);
+  if (fp_result != 30)
+    abort ();
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/REORG.TODO/sysdeps/i386/tst-audit3.h b/REORG.TODO/sysdeps/i386/tst-audit3.h
new file mode 100644
index 0000000000..f6d3b9181e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit3.h
@@ -0,0 +1,20 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+extern long long audit1_test (int, int, int) __attribute__ ((regparm(3))); /* integer result */
+extern float audit2_test (int, int, int) __attribute__ ((regparm(3))); /* floating-point result */
diff --git a/REORG.TODO/sysdeps/i386/tst-auditmod3a.c b/REORG.TODO/sysdeps/i386/tst-auditmod3a.c
new file mode 100644
index 0000000000..a333cdcff9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-auditmod3a.c
@@ -0,0 +1,38 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include "tst-audit3.h"
+
+long long __attribute__ ((regparm(3)))
+audit1_test (int i, int j, int k)
+{
+  const int args_intact = (i == 1 && j == 2 && k == 3);
+  if (!args_intact)	/* the three arguments must arrive as 1, 2, 3 */
+    abort ();
+  return 30;
+}
+
+float __attribute__ ((regparm(3)))
+audit2_test (int i, int j, int k)
+{
+  const int args_intact = (i == 1 && j == 2 && k == 3);
+  if (!args_intact)	/* the three arguments must arrive as 1, 2, 3 */
+    abort ();
+  return 30;
+}
diff --git a/REORG.TODO/sysdeps/i386/tst-auditmod3b.c b/REORG.TODO/sysdeps/i386/tst-auditmod3b.c
new file mode 100644
index 0000000000..523f3cec90
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-auditmod3b.c
@@ -0,0 +1,186 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <link.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  /* Line-buffer stdout for the messages this module prints.  */
+  setlinebuf (stdout);
+  printf ("version: %u\n", v);
+  /* Format V once more into a scratch buffer; the text is not used.  */
+  char scratch[20];
+  sprintf (scratch, "%u", v);
+  /* Hand the same version number back to the caller.  */
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  /* Name of the reported activity, or NULL when FLAG is not recognized.  */
+  const char *label = NULL;
+
+  if (flag == LA_ACT_CONSISTENT)
+    label = "consistent";
+  else if (flag == LA_ACT_ADD)
+    label = "add";
+  else if (flag == LA_ACT_DELETE)
+    label = "delete";
+
+  if (label == NULL)
+    {
+      printf ("activity: unknown activity %u\n", flag);
+      return;
+    }
+
+  printf ("activity: %s\n", label);
+}
+
+char *	/* Trace one library-search step; NAME itself is always returned.  */
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  const char *flagstr;	/* printable name of FLAG for the trace line */
+  switch (flag)
+    {
+    case LA_SER_ORIG:
+      flagstr = "LA_SER_ORIG";	/* spelled like the constant it names */
+      break;
+    case LA_SER_LIBPATH:
+      flagstr = "LA_SER_LIBPATH";
+      break;
+    case LA_SER_RUNPATH:
+      flagstr = "LA_SER_RUNPATH";
+      break;
+    case LA_SER_CONFIG:
+      flagstr = "LA_SER_CONFIG";
+      break;
+    case LA_SER_DEFAULT:
+      flagstr = "LA_SER_DEFAULT";
+      break;
+    case LA_SER_SECURE:
+      flagstr = "LA_SER_SECURE";
+      break;
+    default:
+      printf ("objsearch: %s, unknown flag %u\n", name, flag); /* FLAG is unsigned */
+      return (char *) name;
+    }
+  /* Known flag: print its name; the const is cast away for the return type.  */
+  printf ("objsearch: %s, %s\n", name, flagstr);
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+  /* Same value as the literal 3: both binding-audit flags are set.  */
+  return LA_FLG_BINDTO | LA_FLG_BINDFROM;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  fputs ("preinit\n", stdout);	/* trace the callback; COOKIE is unused */
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  fputs ("objclose\n", stdout);	/* trace the callback; COOKIE is unused */
+  return 0;
+}
+
+uintptr_t
+la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  const long int shown_value = (long int) sym->st_value;
+  printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, shown_value, ndx, *flags);
+  return sym->st_value;	/* the symbol's own value, as just printed */
+}
+
+#include "tst-audit.h"
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  /* Only the two audited test functions get their registers checked.  */
+  const int audited = (strcmp (symname, "audit1_test") == 0
+		       || strcmp (symname, "audit2_test") == 0);
+
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+  if (audited)
+    {
+      /* The arguments 1, 2, 3 must be in %eax, %edx, %ecx respectively.  */
+      if (!(regs->lr_eax == 1 && regs->lr_edx == 2 && regs->lr_ecx == 3))
+	abort ();
+      *framesizep = 200;	/* NOTE(review): presumably enables pltexit -- confirm */
+    }
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  /* Which of the two audited callees, if either, is returning.  */
+  const int is_audit1 = strcmp (symname, "audit1_test") == 0;
+  const int is_audit2 = strcmp (symname, "audit2_test") == 0;
+
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx,
+	  (ptrdiff_t) outregs->int_retval);
+
+  if (is_audit1 || is_audit2)
+    {
+      /* The saved entry registers must still hold the arguments 1, 2, 3.  */
+      if (!(inregs->lr_eax == 1
+	    && inregs->lr_edx == 2
+	    && inregs->lr_ecx == 3))
+	abort ();
+      if (is_audit1)
+	{
+	  /* Rebuild the result from lrv_eax (low) and lrv_edx (high).  */
+	  long long value = ((unsigned long long) outregs->lrv_eax
+			     | (unsigned long long) outregs->lrv_edx << 32);
+	  if (value != 30)
+	    abort ();
+	}
+      else if (is_audit2 && outregs->lrv_st0 != 30)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh b/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh
new file mode 100755
index 0000000000..83a1dc59fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Make sure no code in ld.so uses xmm/ymm/zmm registers on i386.
+# Copyright (C) 2009-2017 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+set -e
+
+objpfx="$1"
+NM="$2"
+OBJDUMP="$3"
+READELF="$4"
+
+tmp=$(mktemp "${objpfx}tst-ld-sse-use.XXXXXX")
+trap 'rm -f "$tmp"' 0 1 2 3 15	# 0 = EXIT: also clean up when set -e aborts
+
+# List of object files we have to test
+rtldobjs=$($READELF -W -wi "${objpfx}dl-allobjs.os" |
+    awk '/^ </ { if ($5 == "(DW_TAG_compile_unit)") c=1; else c=0 } $2 == "DW_AT_name" { if (c == 1) print $NF }' |
+    sed 's,\(.*/\|\)\([_[:alnum:]-]*[.]\).$,\2os,')
+rtldobjs="$rtldobjs $(ar t "${objpfx}rtld-libc.a")"
+
+# OBJECT symbols can be ignored.
+$READELF -sW "${objpfx}dl-allobjs.os" "${objpfx}rtld-libc.a" |
+grep -E " OBJECT  *GLOBAL " |
+awk '{if ($7 != "ABS") print $8 }' |
+sort -u > "$tmp"
+declare -a objects
+objects=($(cat "$tmp"))
+
+objs="dl-runtime.os"
+tocheck="dl-runtime.os"
+# Starting from dl-runtime.os, pull in every object defining a needed symbol.
+while test -n "$objs"; do
+  this="$objs"
+  objs=""
+
+  for f in $this; do
+    undef=$($NM -u "$objpfx"../*/"$f" | awk '{print $2}')
+    if test -n "$undef"; then
+      for s in $undef; do
+	for obj in ${objects[*]} "_GLOBAL_OFFSET_TABLE_"; do
+	  if test "$obj" = "$s"; then
+	    continue 2
+	  fi
+	done
+        for o in $rtldobjs; do
+	  ro=$(echo "$objpfx"../*/"$o")
+	  if $NM -g --defined-only "$ro" | grep -E -qs " $s\$"; then
+	    if ! (echo " $tocheck $objs " | grep -F -qs " $o "); then # whole names only
+	      echo "$o needed for $s"
+	      objs="$objs $o"
+	    fi
+	    break;
+	  fi
+	done
+      done
+    fi
+  done
+  tocheck="$tocheck$objs"
+done
+
+echo
+echo
+echo "object files needed: $tocheck"
+
+cp /dev/null "$tmp"
+for f in $tocheck; do
+  $OBJDUMP -d "$objpfx"../*/"$f" |
+  awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xyz]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' |
+  while read fct; do
+    if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then
+      continue;
+    fi
+    echo "function $fct in $f modifies xmm/ymm/zmm" >> "$tmp"
+    # This loop runs in a pipeline; the verdict is taken from "$tmp" below.
+  done
+done
+
+if test -s "$tmp"; then
+  echo
+  echo
+  cat "$tmp"
+  result=1
+else
+  result=0
+fi
+
+rm "$tmp"
+exit $result
diff --git a/REORG.TODO/sysdeps/i386/tst-stack-align.h b/REORG.TODO/sysdeps/i386/tst-stack-align.h
new file mode 100644
index 0000000000..76276d4a28
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-stack-align.h
@@ -0,0 +1,41 @@
+/* Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <stdint.h>
+
+typedef struct { int i[4]; } int_al16 __attribute__((aligned (16)));
+
+#define TEST_STACK_ALIGN() /* 0 if all three locals are aligned, else 1 */ \
+  ({									     \
+    int_al16 _m;							     \
+    double _d = 12.0;							     \
+    long double _ld = 15.0;						     \
+    int _ret = 0;							     \
+    printf ("int_al16:  %p %zu\n", (void *) &_m, __alignof (int_al16));     \
+    if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0)		     \
+      _ret = 1;								     \
+    /* %p takes a void * argument, hence the casts in the printf calls.  */ \
+    printf ("double:  %g %p %zu\n", _d, (void *) &_d, __alignof (double));  \
+    if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0)		     \
+      _ret = 1;								     \
+    /* Each check masks the address with the type's alignment minus one.  */ \
+    printf ("ldouble: %Lg %p %zu\n", _ld, (void *) &_ld, __alignof (long double)); \
+    if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0)	     \
+      _ret = 1;								     \
+    _ret;								     \
+    })