diff options
author | Alan Modra <amodra@gmail.com> | 2013-08-17 18:47:22 +0930 |
---|---|---|
committer | Alan Modra <amodra@gmail.com> | 2013-10-04 10:41:24 +0930 |
commit | 759cfef3ac4c07dba1ece0bbc1207e099348816d (patch) | |
tree | a0e8cadce4426afb90d39b330dd50688b8975484 /sysdeps/powerpc/powerpc64/power6 | |
parent | fe6e95d7171eba5f3e07848f081676fae4e86322 (diff) | |
download | glibc-759cfef3ac4c07dba1ece0bbc1207e099348816d.tar.gz glibc-759cfef3ac4c07dba1ece0bbc1207e099348816d.tar.xz glibc-759cfef3ac4c07dba1ece0bbc1207e099348816d.zip |
PowerPC LE memcpy
http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html Little-endian support for memcpy. I spent some time cleaning up the 64-bit power7 memcpy, in order to avoid the extra alignment traps power7 takes for little-endian. It probably would have been better to copy the Linux kernel version of memcpy. * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support. * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise. * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise. * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise. * sysdeps/powerpc/powerpc64/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better use of regs. Use power7 mtocrf. Tidy function tails.
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power6')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power6/memcpy.S | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/power6/memcpy.S b/sysdeps/powerpc/powerpc64/power6/memcpy.S index d6d242d293..e3f3d8a303 100644 --- a/sysdeps/powerpc/powerpc64/power6/memcpy.S +++ b/sysdeps/powerpc/powerpc64/power6/memcpy.S @@ -400,15 +400,28 @@ L(das_tail2): blt cr6,5f srdi 7,6,16 bgt cr6,3f +#ifdef __LITTLE_ENDIAN__ + sth 7,0(3) +#else sth 6,0(3) +#endif b 7f .align 4 3: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,24 + stb 6,0(3) + sth 7,1(3) +#else stb 7,0(3) sth 6,1(3) +#endif b 7f .align 4 5: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,8 +#endif stb 6,0(3) 7: cmpldi cr1,10,16 @@ -595,13 +608,24 @@ L(du1_do): bf 30,L(du1_1dw) /* there are at least two DWs to copy */ + /* FIXME: can combine last shift and "or" into "rldimi" */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 8 + sldi 8,6, 64-8 +#else sldi 0,7, 8 srdi 8,6, 64-8 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -610,8 +634,13 @@ L(du1_do): blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du1_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -622,8 +651,13 @@ L(du1_do): b L(du1_loop) .align 4 L(du1_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du1_loop) @@ -635,23 +669,43 @@ L(du1_1dw): .align 4 /* copy 32 bytes at a time */ L(du1_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 8 + sldi 8,6, 64-8 +#else sldi 0,7, 8 srdi 8,6, 64-8 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 8 + sldi 8,6, 
64-8 +#else sldi 0,7, 8 srdi 8,6, 64-8 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -661,8 +715,13 @@ L(du1_loop): .align 4 L(du1_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -672,13 +731,23 @@ L(du2_do): bf 30,L(du2_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 16 + sldi 8,6, 64-16 +#else sldi 0,7, 16 srdi 8,6, 64-16 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -687,8 +756,13 @@ L(du2_do): blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du2_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -699,8 +773,13 @@ L(du2_do): b L(du2_loop) .align 4 L(du2_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du2_loop) @@ -712,23 +791,43 @@ L(du2_1dw): .align 4 /* copy 32 bytes at a time */ L(du2_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 16 + sldi 8,6, 64-16 +#else sldi 0,7, 16 srdi 8,6, 64-16 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 16 + sldi 8,6, 64-16 +#else sldi 0,7, 16 srdi 8,6, 64-16 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -738,8 +837,13 @@ L(du2_loop): .align 4 L(du2_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 std 0,0(4) b 
L(du_done) @@ -749,13 +853,23 @@ L(du3_do): bf 30,L(du3_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 24 + sldi 8,6, 64-24 +#else sldi 0,7, 24 srdi 8,6, 64-24 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -764,8 +878,13 @@ L(du3_do): blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du3_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -776,8 +895,13 @@ L(du3_do): b L(du3_loop) .align 4 L(du3_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du3_loop) @@ -789,23 +913,43 @@ L(du3_1dw): .align 4 /* copy 32 bytes at a time */ L(du3_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 24 + sldi 8,6, 64-24 +#else sldi 0,7, 24 srdi 8,6, 64-24 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 24 + sldi 8,6, 64-24 +#else sldi 0,7, 24 srdi 8,6, 64-24 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -815,8 +959,13 @@ L(du3_loop): .align 4 L(du3_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -832,13 +981,23 @@ L(du4_dox): bf 30,L(du4_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 32 + sldi 8,6, 64-32 +#else 
sldi 0,7, 32 srdi 8,6, 64-32 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -847,8 +1006,13 @@ L(du4_dox): blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du4_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -859,8 +1023,13 @@ L(du4_dox): b L(du4_loop) .align 4 L(du4_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du4_loop) @@ -872,23 +1041,43 @@ L(du4_1dw): .align 4 /* copy 32 bytes at a time */ L(du4_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 32 + sldi 8,6, 64-32 +#else sldi 0,7, 32 srdi 8,6, 64-32 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 32 + sldi 8,6, 64-32 +#else sldi 0,7, 32 srdi 8,6, 64-32 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -898,8 +1087,13 @@ L(du4_loop): .align 4 L(du4_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -909,13 +1103,23 @@ L(du5_do): bf 30,L(du5_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 40 + sldi 8,6, 64-40 +#else sldi 0,7, 40 srdi 8,6, 64-40 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -924,8 +1128,13 @@ L(du5_do): blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du5_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 
srdi 8,7, 64-40 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -936,8 +1145,13 @@ L(du5_do): b L(du5_loop) .align 4 L(du5_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du5_loop) @@ -949,23 +1163,43 @@ L(du5_1dw): .align 4 /* copy 32 bytes at a time */ L(du5_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 40 + sldi 8,6, 64-40 +#else sldi 0,7, 40 srdi 8,6, 64-40 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 40 + sldi 8,6, 64-40 +#else sldi 0,7, 40 srdi 8,6, 64-40 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -975,8 +1209,13 @@ L(du5_loop): .align 4 L(du5_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -986,13 +1225,23 @@ L(du6_do): bf 30,L(du6_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 48 + sldi 8,6, 64-48 +#else sldi 0,7, 48 srdi 8,6, 64-48 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -1001,8 +1250,13 @@ L(du6_do): blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du6_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -1013,8 +1267,13 @@ L(du6_do): b L(du6_loop) .align 4 L(du6_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du6_loop) @@ -1026,23 +1285,43 @@ L(du6_1dw): 
.align 4 /* copy 32 bytes at a time */ L(du6_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 48 + sldi 8,6, 64-48 +#else sldi 0,7, 48 srdi 8,6, 64-48 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 48 + sldi 8,6, 64-48 +#else sldi 0,7, 48 srdi 8,6, 64-48 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -1052,8 +1331,13 @@ L(du6_loop): .align 4 L(du6_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -1063,13 +1347,23 @@ L(du7_do): bf 30,L(du7_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 56 + sldi 8,6, 64-56 +#else sldi 0,7, 56 srdi 8,6, 64-56 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -1078,8 +1372,13 @@ L(du7_do): blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du7_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -1090,8 +1389,13 @@ L(du7_do): b L(du7_loop) .align 4 L(du7_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du7_loop) @@ -1103,23 +1407,43 @@ L(du7_1dw): .align 4 /* copy 32 bytes at a time */ L(du7_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 56 + sldi 8,6, 64-56 +#else sldi 0,7, 56 srdi 8,6, 64-56 +#endif or 
0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 56 + sldi 8,6, 64-56 +#else sldi 0,7, 56 srdi 8,6, 64-56 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -1129,8 +1453,13 @@ L(du7_loop): .align 4 L(du7_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 std 0,0(4) b L(du_done) |